diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 00000000..d54d09d5 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,3 @@ +# These are supported funding model platforms + +patreon: 0xAX diff --git a/.github/ISSUE_TEMPLATE/content-issue.yml b/.github/ISSUE_TEMPLATE/content-issue.yml new file mode 100644 index 00000000..ae88eed8 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/content-issue.yml @@ -0,0 +1,39 @@ +name: 📖 Content issue +description: Report an issue with the content +body: + - type: markdown + attributes: + value: | + Use this form to report an issue with the content. + + When contributing, make sure to follow Contributing guidelines and Code of Conduct. + Thank you for your contribution! + + - type: checkboxes + attributes: + label: Existing issues + description: Is there an existing issue for this? Search open and closed issues to avoid duplicates. + options: + - label: I have searched the existing issues. + required: true + + - type: input + attributes: + label: Affected document + description: Name or paste a link to the document that contains an issue. + validations: + required: true + + - type: textarea + attributes: + label: Issue description + description: Explain what is unclear or confusing in the given document. + validations: + required: true + + - type: textarea + attributes: + label: Attachments + description: Include screenshots or links if applicable. + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/question.yml b/.github/ISSUE_TEMPLATE/question.yml new file mode 100644 index 00000000..a1255e1e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/question.yml @@ -0,0 +1,39 @@ +name: ❓ Questions and discussions +description: Ask a question or start a discussion with other community members. +body: + - type: markdown + attributes: + value: | + Use this form to ask a question or start a discussion with other community members. + + When contributing, make sure to follow Contributing guidelines and Code of Conduct. + Thank you for your contribution! + + - type: checkboxes + attributes: + label: Existing issues + description: Is there an existing issue for this? Search open and closed issues to avoid duplicates. + options: + - label: I have searched the existing issues. + required: true + + - type: textarea + attributes: + label: Question + description: Ask a question you would like to discuss with the community. + validations: + required: false + + - type: textarea + attributes: + label: Discussion + description: Start a discussion topic. + validations: + required: false + + - type: textarea + attributes: + label: Attachments + description: Include screenshots, links, or example's output if applicable. + validations: + required: false diff --git a/.github/dependabot.yaml b/.github/dependabot.yaml new file mode 100644 index 00000000..12301490 --- /dev/null +++ b/.github/dependabot.yaml @@ -0,0 +1,6 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "daily" diff --git a/.github/pull-request-template.md b/.github/pull-request-template.md new file mode 100644 index 00000000..ae2a5cf7 --- /dev/null +++ b/.github/pull-request-template.md @@ -0,0 +1,18 @@ + + +**Description** + + + +Changes proposed in this pull request: + +- ... +- ... +- ... + +**Related issues** + + diff --git a/.github/workflows/check-code-snippets.yaml b/.github/workflows/check-code-snippets.yaml new file mode 100644 index 00000000..3aa534a5 --- /dev/null +++ b/.github/workflows/check-code-snippets.yaml @@ -0,0 +1,32 @@ +name: check code snippets + +on: + workflow_dispatch: + push: + branches: + - main + pull_request: + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + check-code-snippets: + name: check-code-snippets + runs-on: + - ubuntu-22.04 + steps: + - name: Checkout repository + uses: actions/checkout@v5 + - name: Setup python + uses: actions/setup-python@v6 + with: + python-version: '3.13' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install requests + - name: Validate code snippets + run: | + python ./scripts/check_code_snippets.py . diff --git a/.github/workflows/generate-e-books.yaml b/.github/workflows/generate-e-books.yaml new file mode 100644 index 00000000..84cb9c7b --- /dev/null +++ b/.github/workflows/generate-e-books.yaml @@ -0,0 +1,63 @@ +name: Generate e-books + +on: + pull_request: + branches: + - master + workflow_dispatch: {} # For manual runs. + +jobs: + build-for-pr: + # For every PR, build the same artifacts and make them accessible from the PR. + if: github.event_name == 'pull_request' + runs-on: ubuntu-latest + + permissions: + contents: read + pull-requests: write + + steps: + - name: Checkout repository + uses: actions/checkout@v5 + + - name: Export all supported book formats from the Docker container + run: | + make run + make export + + - name: Copy generated files to host system + run: | + make cp + mkdir -p artifacts/ + mv "Linux Inside - 0xAX.epub" \ + "Linux Inside - 0xAX.mobi" \ + "Linux Inside - 0xAX.pdf" \ + "Linux Inside - 0xAX (A5).pdf" \ + artifacts/ + + - name: Upload PR artifacts + uses: actions/upload-artifact@v4 + with: + name: ebooks-${{ github.sha }} + path: artifacts/* + if-no-files-found: error + # Change the retention period here if necessary. + retention-days: 7 + + - name: Add a comment with a link to the generated artifacts. + # For forked PRs the token is read-only; skip commenting to avoid failures. + if: ${{ github.event.pull_request.head.repo.full_name == github.event.pull_request.base.repo.full_name }} + uses: actions/github-script@v8 + env: + RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + with: + script: | + const body = [ + `E-books generated for this pull request available at: ${process.env.RUN_URL}` + ].join('\n'); + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body + }); diff --git a/.github/workflows/release-e-books.yaml b/.github/workflows/release-e-books.yaml new file mode 100644 index 00000000..ef378954 --- /dev/null +++ b/.github/workflows/release-e-books.yaml @@ -0,0 +1,63 @@ +name: Release e-books + +on: + push: + tags: + - 'v*.*' # Create a release only when a new tag matching v*.* is pushed. + # To also create a release for each push to the main branch, uncomment the following 2 lines: + # branches: + # - master + workflow_dispatch: {} # For manual runs. + +jobs: + release-ebooks: + runs-on: ubuntu-latest + + permissions: + contents: write + + steps: + - name: Checkout repository + uses: actions/checkout@v5 + + - name: Export all supported book formats from the Docker container + run: | + make run + make export + + - name: Copy generated files to host system + run: | + make cp + mkdir -p artifacts/ + mv "Linux Inside - 0xAX.epub" \ + "Linux Inside - 0xAX.mobi" \ + "Linux Inside - 0xAX.pdf" \ + "Linux Inside - 0xAX (A5).pdf" \ + artifacts/ + cp LICENSE artifacts/ + + - name: Prepare release metadata + # Use tag name when running on a tag, otherwise fall back to the short commit hash. + id: meta + env: + GITHUB_REF_TYPE: ${{ github.ref_type }} + GITHUB_REF_NAME: ${{ github.ref_name }} + run: | + DATE_UTC="$(date -u '+%m/%d/%Y %H:%M')" + if [ "${GITHUB_REF_TYPE}" = "tag" ] && [ -n "${GITHUB_REF_NAME}" ]; then + LABEL="${GITHUB_REF_NAME}" + else + LABEL="$(git rev-parse --short HEAD)" + fi + echo "release_name=${DATE_UTC} (${LABEL})" >> "$GITHUB_OUTPUT" + echo "tag_name=${LABEL}" >> "$GITHUB_OUTPUT" + + - name: Create GitHub release + uses: softprops/action-gh-release@v2 + with: + files: artifacts/* + name: ${{ steps.meta.outputs.release_name }} + tag_name: ${{ steps.meta.outputs.tag_name }} + target_commitish: ${{ github.sha }} + generate_release_notes: true + fail_on_unmatched_files: true diff --git a/Assets/linux-kernel.png b/Assets/linux-kernel.png index 7cc0df73..5ab0a316 100644 Binary files a/Assets/linux-kernel.png and b/Assets/linux-kernel.png differ diff --git a/Booting/README.md b/Booting/README.md index 6ed74a78..fbcf31db 100644 --- a/Booting/README.md +++ b/Booting/README.md @@ -1,12 +1,12 @@ # Kernel Boot Process -This chapter describes the linux kernel boot process. Here you will see a series of posts which describes the full cycle of the kernel loading process: +This chapter describes the Linux kernel boot process. Here you will see a series of posts which describes the full cycle of the kernel loading process: * [From the bootloader to kernel](linux-bootstrap-1.md) - describes all stages from turning on the computer to running the first instruction of the kernel. * [First steps in the kernel setup code](linux-bootstrap-2.md) - describes first steps in the kernel setup code. You will see heap initialization, query of different parameters like EDD, IST and etc... * [Video mode initialization and transition to protected mode](linux-bootstrap-3.md) - describes video mode initialization in the kernel setup code and transition to protected mode. * [Transition to 64-bit mode](linux-bootstrap-4.md) - describes preparation for transition into 64-bit mode and details of transition. * [Kernel Decompression](linux-bootstrap-5.md) - describes preparation before kernel decompression and details of direct decompression. -* [Kernel random address randomization](linux-bootstrap-6.md) - describes randomization of the Linux kernel load address. +* [Kernel load address randomization](linux-bootstrap-6.md) - describes randomization of the Linux kernel load address. This chapter coincides with `Linux kernel v4.17`. diff --git a/Booting/images/bss.png b/Booting/images/bss.png index c9b802e9..b7a1e9b5 100644 Binary files a/Booting/images/bss.png and b/Booting/images/bss.png differ diff --git a/Booting/images/early-bss.svg b/Booting/images/early-bss.svg new file mode 100644 index 00000000..3d29cc7d --- /dev/null +++ b/Booting/images/early-bss.svg @@ -0,0 +1,4 @@ + + + +
Stack
Stack
Kernel setup code
Kernel setup code
0x9000:0x9000
0x9000:0x9000
0x9200:0x0000
0x9200:0x0000
CS:IP
CS:IP
SS:SP
SS:SP
Kernel setup header
Kernel setup header
0x9000:0x0000
0x9000:0x0000
CS:IP
CS:IP
BSS
BSS
0x9000:0x5280
0x9000:0x5280
0x9000:0x3f00
0x9000:0x3f00
__bss_start
__bss_start
__bss_end
__bss_end
Text is not SVG - cannot display
\ No newline at end of file diff --git a/Booting/images/early-heap.svg b/Booting/images/early-heap.svg new file mode 100644 index 00000000..20c0e601 --- /dev/null +++ b/Booting/images/early-heap.svg @@ -0,0 +1,4 @@ + + + +
Stack
Stack
Kernel setup code
Kernel setup code
0x9000:0x9000
0x9000:0x9000
0x9200:0x0000
0x9200:0x0000
CS:IP
CS:IP
SS:SP
SS:SP
Kernel setup header
Kernel setup header
0x9000:0x0000
0x9000:0x0000
CS:IP
CS:IP
BSS
BSS
0x9000:0x5280
0x9000:0x5280
0x9000:0x3f00
0x9000:0x3f00
__bss_start
__bss_start
__bss_end
__bss_end
Heap
Heap
heap_end
heap_end
0x9000:0x8C00
0x9000:0x8C00
Text is not SVG - cannot display
\ No newline at end of file diff --git a/Booting/images/early-page-table.svg b/Booting/images/early-page-table.svg new file mode 100644 index 00000000..97bc6687 --- /dev/null +++ b/Booting/images/early-page-table.svg @@ -0,0 +1,4 @@ + + + +
11
11
0
0
12
12
20
20
21
21
29
29
30
30
31
31
32
32
47
47
33
33
PML4
index
PML4...
PDPT
index
PDPT...
PD
index
PD...
PT
index
PT...
Offset
Offset
Text is not SVG - cannot display
\ No newline at end of file diff --git a/Booting/images/early-stack.svg b/Booting/images/early-stack.svg new file mode 100644 index 00000000..2d493362 --- /dev/null +++ b/Booting/images/early-stack.svg @@ -0,0 +1,4 @@ + + + +
Stack
Stack
Kernel setup code
Kernel setup code
0x9000:0x9000
0x9000:0x9000
0x9200:0x0000
0x9200:0x0000
CS:IP
CS:IP
SS:SP
SS:SP
Kernel setup header
Kernel setup header
0x9000:0x0000
0x9000:0x0000
CS:IP
CS:IP
Text is not SVG - cannot display
\ No newline at end of file diff --git a/Booting/images/kernel_first_address.png b/Booting/images/kernel_first_address.png index 4fb0ddee..ba746f9e 100644 Binary files a/Booting/images/kernel_first_address.png and b/Booting/images/kernel_first_address.png differ diff --git a/Booting/images/linear_address.png b/Booting/images/linear_address.png deleted file mode 100644 index 6a9eaca0..00000000 Binary files a/Booting/images/linear_address.png and /dev/null differ diff --git a/Booting/images/minimal_stack.png b/Booting/images/minimal_stack.png index 123da34a..ac1ac9a1 100644 Binary files a/Booting/images/minimal_stack.png and b/Booting/images/minimal_stack.png differ diff --git a/Booting/images/segment-descriptor.svg b/Booting/images/segment-descriptor.svg new file mode 100644 index 00000000..fb3033af --- /dev/null +++ b/Booting/images/segment-descriptor.svg @@ -0,0 +1,4 @@ + + + +
LIMIT 19:16
LIMIT 19:16
0
0
31
31
15
15
16
16
32
32
63
63
39
39
BASE 31:34
BASE 31:34
G
G
B
B
D
D
/
/
L
L
A
A
V
V
L
L
P
P
DPL
DPL
S
S
TYPE
TYPE
0 | E | V | A
0 | E | V...
1 | C | R | A
1 | C | R...
BASE 23:16
BASE 23:16
56
56
51
51
48
48
45
45

BASE 15:0
BASE 15:0
LIMIT 15:0
LIMIT 15:0
Text is not SVG - cannot display
\ No newline at end of file diff --git a/Booting/images/segment-selector.svg b/Booting/images/segment-selector.svg new file mode 100644 index 00000000..64695c60 --- /dev/null +++ b/Booting/images/segment-selector.svg @@ -0,0 +1,4 @@ + + + +
Index
Index
RPL
RPL
TI
TI
15
15
3
3
2
2
0
0
1
1
Text is not SVG - cannot display
\ No newline at end of file diff --git a/Booting/images/simple_bootloader.png b/Booting/images/simple_bootloader.png deleted file mode 100644 index 2443974d..00000000 Binary files a/Booting/images/simple_bootloader.png and /dev/null differ diff --git a/Booting/images/stack1.png b/Booting/images/stack1.png index 534be3af..78d4c999 100644 Binary files a/Booting/images/stack1.png and b/Booting/images/stack1.png differ diff --git a/Booting/images/stack2.png b/Booting/images/stack2.png index 9eb8f0cd..3e278950 100644 Binary files a/Booting/images/stack2.png and b/Booting/images/stack2.png differ diff --git a/Booting/images/startup_32.svg b/Booting/images/startup_32.svg new file mode 100644 index 00000000..69ac9502 --- /dev/null +++ b/Booting/images/startup_32.svg @@ -0,0 +1,4 @@ + + + +
startup_32
startup_32
instructions
instructions
1:    popl %ebp
1:    popl %ebp
instructions
instructions
0x00100000 (runtime address)
0x00100000 (runtime address)
%ebp will point here after the sub
%ebp will point here after the sub
%ebp will point here right after pop
%ebp will point here right after pop
0x001000XX (runtime address of label 1)
0x001000XX (runtime address of label 1)
Text is not SVG - cannot display
\ No newline at end of file diff --git a/Booting/images/try_vmlinuz_in_qemu.png b/Booting/images/try_vmlinuz_in_qemu.png index e24b925b..5b44d698 100644 Binary files a/Booting/images/try_vmlinuz_in_qemu.png and b/Booting/images/try_vmlinuz_in_qemu.png differ diff --git a/Booting/images/video_mode_setup_menu.png b/Booting/images/video_mode_setup_menu.png index 78530173..99162ace 100644 Binary files a/Booting/images/video_mode_setup_menu.png and b/Booting/images/video_mode_setup_menu.png differ diff --git a/Booting/linux-bootstrap-1.md b/Booting/linux-bootstrap-1.md index 7393b63f..b63b336f 100644 --- a/Booting/linux-bootstrap-1.md +++ b/Booting/linux-bootstrap-1.md @@ -1,124 +1,151 @@ -Kernel booting process. Part 1. -================================================================================ +# Kernel Booting Process — Part 1 -From the bootloader to the kernel --------------------------------------------------------------------------------- +If you’ve read my earlier [posts](https://github.com/0xAX/asm) about [assembly language](https://en.wikipedia.org/wiki/Assembly_language) for Linux x86_64, you might see that I started to get interested in low-level programming. I’ve written a set of articles on assembly programming for [x86_64](https://en.wikipedia.org/wiki/X86-64) Linux and, in parallel, began exploring the Linux kernel source code. I’ve always been fascinated by what happens under the hood — how programs execute on a CPU, how they’re laid out in memory, how the kernel schedules processes and manages resources, how the network stack operates at a low level, and many other details. This series is a way of sharing my journey. -If you read my previous [blog posts](https://0xax.github.io/categories/assembler/), you might have noticed that I have been involved with low-level programming for some time. I wrote some posts about assembly programming for `x86_64` Linux and, at the same time, started to dive into the Linux kernel source code. +> [!NOTE] +> This is not official Linux kernel documentation, it is a learning project. I’m not a professional Linux kernel developer, and I don’t write kernel code as part of my daily job. Learning how the Linux kernel works is just my hobby. If you find anything unclear, spot an error, or have questions or suggestions, feel free to reach out - you always can ping me on X [0xAX](https://twitter.com/0xAX), send me an [email](mailto:anotherworldofworld@gmail.com) or open a new [issue](https://github.com/0xAX/linux-insides/issues/new). Your feedback is always welcome and appreciated. -I have a great interest in understanding how low-level things work, how programs run on my computer, how they are located in memory, how the kernel manages processes and memory, how the network stack works at a low level, and many many other things. So, I decided to write yet another series of posts about the Linux kernel for the **x86_64** architecture. +The main goal of this series is to provide a guide to the Linux kernel for readers who want to begin learning how it works. We will explore not only what the kernel does, but will try to understand how and why it does it. Despite being considered to be understandable for anyone who is interested in Linux kernel, it is highly recommended to have some prior knowledge before starting to read these notes. If you want to experiment with the kernel code, first of all it is best to have a [Linux distribution](https://en.wikipedia.org/wiki/Linux_distribution) installed. Besides that, on these pages we will see much of [C](https://en.wikipedia.org/wiki/C_(programming_language)) and [assembly](https://en.wikipedia.org/wiki/Assembly_language) code, so the good understanding of these programming languages is highly required. -Note that I'm not a professional kernel hacker and I don't write code for the kernel at work. It's just a hobby. I just like low-level stuff, and it is interesting for me to see how these things work. So if you notice anything confusing, or if you have any questions/remarks, ping me on Twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com) or just create an [issue](https://github.com/0xAX/linux-insides/issues/new). I appreciate it. +> [!IMPORTANT] +> I started writing this series when the latest version of the kernel was `3.18`. A lot has changed since then, and I am in the process of updating the content to reflect modern kernels where possible — now focusing on v6.16+. I’ll continue revising the posts as the kernel evolves. -All posts will also be accessible at [github repo](https://github.com/0xAX/linux-insides) and, if you find something wrong with my English or the post content, feel free to send a pull request. +That’s enough introduction — let’s dive into the Linux kernel! -*Note that this isn't official documentation, just learning and sharing knowledge.* +## The Magic Power Button - What happens next? -**Required knowledge** +Although this is a series of posts about Linux kernel, we will not jump straight into kernel code. First, let’s step back and look at what happens before the kernel even comes into play. Everything starts from the turning on a computer. And we will start from this point as well. -* Understanding C code -* Understanding assembly code (AT&T syntax) +When you press the "magic" power button on your laptop or desktop computer, the [motherboard](https://en.wikipedia.org/wiki/Motherboard) sends a signal to the [power supply](https://en.wikipedia.org/wiki/Power_supply). In response, the power supply delivers the proper amount of electricity to other components of the computer. Once the motherboard receives the [power good signal](https://en.wikipedia.org/wiki/Power_good_signal), it triggers the CPU to start. The CPU then performs a reset: it clears any leftover data in its registers and loads predefined values into each of them, preparing for the very first instructions of the boot process. -Anyway, if you're just starting to learn such tools, I will try to explain some parts during this and the following posts. Alright, this is the end of the simple introduction. Let's start to dive into the Linux kernel and low-level stuff! +Each **x86_64** processor begins execution in a special mode called [real mode](https://en.wikipedia.org/wiki/Real_mode). This mode exists for historical reasons - to be compatible with the earliest processors. Real mode is supported on all x86-compatible processors — from the original [8086](https://en.wikipedia.org/wiki/Intel_8086) to today’s modern 64-bit CPUs. -I started writing these posts at the time of the `3.18` Linux kernel, and many things have changed since that time. If there are changes, I will update the posts accordingly. +The **8086** was a 16-bit microprocessor. Basically it means that its general-purpose registers and instruction pointer were `16` bits wide. However, the chip was designed with a `20-bit` physical memory address bus — the set of electrical lines used to select memory locations. With `20` address lines, the CPU can form addresses from `0x00000` to `0xFFFFF`, giving access to exactly `1 MB` of physical memory or `2^20` bytes. -The Magical Power Button, What happens next? --------------------------------------------------------------------------------- +Because the registers on **8086** processors were only `16` bits wide, the largest value they could hold was `0xFFFF` which equals 64 KB. This means that, using just a single 16-bit value, the CPU could only directly address 64 KB of memory at a time. This leads us to the question - how can a processor with 16-bit registers access 20-bit addresses? The answer is [memory segmentation](https://en.wikipedia.org/wiki/Memory_segmentation). -Although this is a series of posts about the Linux kernel, we won't start directly from the kernel code. As soon as you press the magical power button on your laptop or desktop computer, it starts working. The motherboard sends a signal to the [power supply](https://en.wikipedia.org/wiki/Power_supply) device. After receiving the signal, the power supply provides the proper amount of electricity to the computer. Once the motherboard receives the [power good signal](https://en.wikipedia.org/wiki/Power_good_signal), it tries to start the CPU. The CPU resets all leftover data in its registers and sets predefined values for each of them. +To make use of the entire 1 MB space provided by the 20-bit address bus, the **8086** used a scheme called [memory segmentation](https://en.wikipedia.org/wiki/Memory_segmentation). All memory is divided into small, fixed-size segments of `65_536` bytes each. Instead of using just one value to identify a memory location, a CPU uses the two: -The [80386](https://en.wikipedia.org/wiki/Intel_80386) and later CPUs define the following predefined data in CPU registers after the computer resets: +1. Segment selector — identifies the starting point (base address) of a 64 KB segment. Represented by the value of the `cs` (code-segment) register. +2. Offset — specifies how far into that segment the target address is. Represented by the value of the `ip` register. + +In real mode, the base address for a given segment selector is calculated as: ``` -IP 0xfff0 -CS selector 0xf000 -CS base 0xffff0000 +Base Address = Segment Selector << 4 ``` -The processor starts working in [real mode](https://en.wikipedia.org/wiki/Real_mode). Let's back up a little and try to understand [memory segmentation](https://en.wikipedia.org/wiki/Memory_segmentation) in this mode. Real mode is supported on all x86-compatible processors, from the [8086](https://en.wikipedia.org/wiki/Intel_8086) CPU all the way to the modern Intel 64-bit CPUs. The `8086` processor has a 20-bit address bus, which means that it could work with a `0-0xFFFFF` or `1 megabyte` address space. But it only has `16-bit` registers, which have a maximum address of `2^16 - 1` or `0xffff` (64 kilobytes). - -[Memory segmentation](https://en.wikipedia.org/wiki/Memory_segmentation) is used to make use of all the address space available. All memory is divided into small, fixed-size segments of `65536` bytes (64 KB). Since we cannot address memory above `64 KB` with 16-bit registers, an alternate method was devised. - -An address consists of two parts: a segment selector, which has a base address; and an offset from this base address. In real mode, the associated base address of a segment selector is `Segment Selector * 16`. Thus, to get a physical address in memory, we need to multiply the segment selector part by `16` and add the offset to it: +To compute the final physical memory address, the CPU adds the base address to the offset: ``` -PhysicalAddress = Segment Selector * 16 + Offset +Physical Address = Base Address + Offset ``` -For example, if `CS:IP` is `0x2000:0x0010`, then the corresponding physical address will be: +For example, if the value of the `cs:ip` is `0x2000:0x0010`, then the corresponding physical address will be: ```python >>> hex((0x2000 << 4) + 0x0010) '0x20010' ``` -But, if we take the largest segment selector and offset, `0xffff:0xffff`, then the resulting address will be: +If we take the largest possible values for the segment selector and the offset - `0xFFFF:0xFFFF`, the resulting address will be: ```python >>> hex((0xffff << 4) + 0xffff) '0x10ffef' ``` -which is `65520` bytes past the first megabyte. Since only one megabyte is accessible in real mode, `0x10ffef` becomes `0x00ffef` with the [A20 line](https://en.wikipedia.org/wiki/A20_line) disabled. +This gives us the address `0x10FFEF`, which is `65_520` bytes past the 1 MB boundary. Since, in real mode on the original **8086** CPU, the CPU could only access the first 1 MB of memory, any address above `0xFFFFF` would wrap around back to the beginning of the address space. On modern **386+** CPUs the physical bus is wider even in real mode, but the address computation still based on the `segment:offset`. -Ok, now we know a little bit about real mode and its memory addressing. Let's get back to discussing register values after reset. +Now that we understand the basics of real mode and its memory addressing limitations, let’s return to the state after a hardware reset. -The `CS` register consists of two parts: the visible segment selector and the hidden base address. In real-address mode, the base address is normally formed by shifting the 16-bit segment selector value 4 bits to the left to produce a 20-bit base address. However, during a hardware reset the segment selector in the CS register is loaded with `0xf000` and the base address is loaded with `0xffff0000`. The processor uses this special base address until `CS` changes. +## First code executed after reset -The starting address is formed by adding the base address to the value in the EIP register: +The system has just been powered on, the reset signal has been released, and the processor is waking up to execute first instructions. The [80386](https://en.wikipedia.org/wiki/Intel_80386) and later CPUs set the following [register](https://en.wikipedia.org/wiki/X86#x86_registers) values after a hardware reset: + +| Register | Value | Meaning | +| ------------------ | ------------ | ------------------------------------------------------------------------------ | +| `ip` | `0xFFF0` | Instruction pointer; execution starts here within the current code segment | +| `cs` (selector) | `0xF000` | Visible code segment selector value after reset | +| `cs` (base) | `0xFFFF0000` | Hidden descriptor base address loaded into `cs` during reset | + +In real mode, the base address is normally formed by shifting the 16-bit segment selector value 4 bits left to produce a 20-bit physical address. However, after the hardware reset the first instruction will be located at the special address. We may see that the segment selector in the `cs` register is loaded with `0xF000` but the hidden base address is loaded with `0xFFFF0000`. Instead of using the usual formula to get the address, the processor uses this value as the base address of the first instruction. Having the value of the base address and the offset (from the `ip` register), the starting address will be: ```python ->>> 0xffff0000 + 0xfff0 +>>> hex(0xffff0000 + 0xfff0) '0xfffffff0' ``` -We get `0xfffffff0`, which is 16 bytes below 4GB. This point is called the [reset vector](https://en.wikipedia.org/wiki/Reset_vector). It's the memory location at which the CPU expects to find the first instruction to execute after reset. It contains a [jump](https://en.wikipedia.org/wiki/JMP_%28x86_instruction%29) (`jmp`) instruction that usually points to the [BIOS](https://en.wikipedia.org/wiki/BIOS) (Basic Input/Output System) entry point. For example, if we look in the [coreboot](https://www.coreboot.org/) source code (`src/cpu/x86/16bit/reset16.inc`), we see: +We got `0xFFFFFFF0`, which is 16 bytes below 4GB. This is the very first address where the CPU starts the execution after reset. This address has special name - [reset vector](https://en.wikipedia.org/wiki/Reset_vector). It is the memory location at which the CPU expects to find the first instruction to execute after reset. Usually it contains a [jump](https://en.wikipedia.org/wiki/JMP_%28x86_instruction%29) (`jmp`) instruction which points to the [BIOS](https://en.wikipedia.org/wiki/BIOS) or [UEFI](https://en.wikipedia.org/wiki/UEFI) entry point. For example, if we take a look at the [source code](https://github.com/coreboot/coreboot/blob/main/src/cpu/x86/reset16.S) of the [coreboot](https://www.coreboot.org/), we will see it there: + ```assembly - .section ".reset", "ax", %progbits - .code16 -.globl _start + /* This is the first instruction the CPU runs when coming out of reset. */ +.section ".reset", "ax", %progbits +.globl _start _start: - .byte 0xe9 - .int _start16bit - ( . + 2 ) - ... + jmp _start16bit ``` -Here we can see the `jmp` instruction [opcode](http://ref.x86asm.net/coder32.html#xE9), which is `0xe9`, and its destination address at `_start16bit - ( . + 2)`. +To prove that this code is located at the `0xFFFFFFF0` address, we may take a look at the [linker script](https://github.com/coreboot/coreboot/blob/master/src/arch/x86/bootblock.ld): -We also see that the `reset` section is `16` bytes and is compiled to start from the address `0xfffffff0` (`src/cpu/x86/16bit/reset16.ld`): - -``` -SECTIONS { - /* Trigger an error if I have an unusable start address */ - _bogus = ASSERT(_start16bit >= 0xffff0000, "_start16bit too low. Please report."); - _ROMTOP = 0xfffffff0; - . = _ROMTOP; - .reset . : { - *(.reset); - . = 15; - BYTE(0x00); - } -} + +```linker-script + . = 0xfffffff0; + _X86_RESET_VECTOR = .; + .reset . : { + *(.reset); + . = _X86_RESET_VECTOR_FILLING; + BYTE(0); + } ``` -Now the BIOS starts. After initializing and checking the hardware, the BIOS needs to find a bootable device. A boot order is stored in the BIOS configuration, controlling which devices the BIOS attempts to boot from. When attempting to boot from a hard drive, the BIOS tries to find a boot sector. On hard drives partitioned with an [MBR partition layout](https://en.wikipedia.org/wiki/Master_boot_record), the boot sector is stored in the first `446` bytes of the first sector, where each sector is `512` bytes. The final two bytes of the first sector are `0x55` and `0xaa`, which designates to the BIOS that this device is bootable. Once the BIOS finds the boot sector, it copies it into a fixed memory location at 0x7c00, jumps to there and start executing it. +The address `0xFFFFFFF0` is much larger than `0xFFFFF` (1MB). How can the CPU access this address in real mode? The answer is simple. Most likely you have something more modern than **8086** CPU with 20-bit address bus. More modern processors starts in real mode but with 32-bit or 64-bit bus. + +When the CPU wakes up, it reads the jump at the `0xFFFFFFF0` address, jump into the firmware, and the long chain of the boot process begins. This is the very first step on the way to boot the Linux kernel. + +## From Power-On to Bootloader + +We stopped at the point when a CPU jumps from the reset vector to the firmware. On a legacy PC, that means the BIOS. On modern computers it is UEFI. In the next chapters we will see the booting processes on a legacy PC using the BIOS, and later UEFI. -For example: +The first job of BIOS is to bring the system into a working state. It runs a series of hardware checks and initializations — memory tests, peripheral setup, chipset configuration — all part of the [POST](https://en.wikipedia.org/wiki/Power-on_self-test) routine. Once everything is checked, the next step is to find an operating system to boot. The BIOS doesn’t pick just a random disk. It follows a boot order, a list stored in its configuration. + +When the BIOS tries to boot from a hard drive, it looks for a [boot sector](https://en.wikipedia.org/wiki/Boot_sector). On hard drives partitioned with an [MBR partition layout](https://en.wikipedia.org/wiki/Master_boot_record), the boot sector is stored in the first `446` bytes of the first sector, where each sector is `512` bytes. The final two bytes of the first sector must be `0x55` and `0xAA`. These two last bytes says to BIOS somewhat like "yes - this device is bootable". Once the BIOS finds the valid boot sector, it copies it into the fixed memory location at `0x7C00`, jumps to there and start executing it. + +In general, real mode's memory map is as follows: + +| Address Range | Description | +|-----------------------|--------------------------------------| +| 0x00000000–0x000003FF | Real Mode Interrupt Vector Table | +| 0x00000400–0x000004FF | BIOS Data Area | +| 0x00000500–0x00007BFF | Unused | +| 0x00007C00–0x00007DFF | Bootloader | +| 0x00007E00–0x0009FFFF | Unused | +| 0x000A0000–0x000BFFFF | Video RAM (VRAM) Memory | +| 0x000B0000–0x000B7777 | Monochrome Video Memory | +| 0x000B8000–0x000BFFFF | Color Video Memory | +| 0x000C0000–0x000C7FFF | Video ROM BIOS | +| 0x000C8000–0x000EFFFF | BIOS Shadow Area | +| 0x000F0000–0x000FFFFF | System BIOS | + +We can do a simple experiment and create a very primitive boot code: ```assembly -; -; Note: this example is written in Intel Assembly syntax -; +;; +;; Note: this example is written using NASM assembler +;; [BITS 16] boot: + ;; Symbol to print mov al, '!' + ;; TTY-style text output mov ah, 0x0e + ;; Position where to print the character mov bh, 0x00 + ;; Color mov bl, 0x07 - + ;; Interrupt call int 0x10 jmp $ @@ -128,455 +155,396 @@ db 0x55 db 0xaa ``` -Build and run this with: +You can build and run this code using the following commands: -``` -nasm -f bin boot.nasm && qemu-system-x86_64 boot +```bash +nasm -f bin boot.S && qemu-system-x86_64 boot -nographic ``` -This will instruct [QEMU](https://www.qemu.org/) to use the `boot` binary that we just built as a disk image. Since the binary generated by the assembly code above fulfills the requirements of the boot sector (we end it with the magic sequence), QEMU will treat the binary as the master boot record (MBR) of a disk image. Note that when providing a boot binary image to QEMU, setting the origin to 0x7c00 (using `[ORG 0x7c00]`) -is unneeded. +This will instruct [QEMU](https://www.qemu.org/) virtual machine to use the `boot` binary that we just built as a disk image. Since the binary generated by the assembly code above fulfills the requirements of the boot sector (we end it with the magic sequence), QEMU will treat the binary as the master boot record (MBR) of a disk image. -You will see: - -![Simple bootloader which prints only `!`](images/simple_bootloader.png) - -In this example, we can see that the code will be executed in `16-bit` real mode. After starting, it calls the [0x10](http://www.ctyme.com/intr/rb-0106.htm) interrupt, which just prints the `!` symbol. The times directive will pad that number of bytes up to 510th byte with zeros and finishes with the two magic bytes `0xaa` and `0x55`. - -You can see a binary dump of this using the `objdump` utility: +If you did everything correctly, you will see something like this after run of the command above: ``` -nasm -f bin boot.nasm -objdump -D -b binary -mi386 -Maddr16,data16,intel boot -``` - -A real-world boot sector has code for continuing the boot process and a partition table instead of a bunch of 0's and an exclamation mark. :) From this point onwards, the BIOS hands control over to the bootloader. +SeaBIOS (version 1.17.0-5.fc42) -**NOTE**: As explained above, the CPU is in real mode. In real mode, calculating the physical address in memory is done as follows: +iPXE (https://ipxe.org) 00:03.0 CA00 PCI2.10 PnP PMM+06FCAEC0+06F0AEC0 CA00 -``` -PhysicalAddress = Segment Selector * 16 + Offset +Booting from Hard Disk... +! ``` -just as explained above. We have only 16-bit general purpose registers, which has a maximum value of `0xffff`, so if we take the largest values the result will be: +Of course, a real-world boot sector has "slightly" speaking more code for loading of an operating system instead of printing an exclamation mark, but it may interesting to experiment. In this example, we can see that the code will be executed in `16-bit` real mode which is specified by the `[BITS 16]` directive. After starting, it calls the [0x10](https://en.wikipedia.org/wiki/INT_10H) interrupt, which just prints the `!` symbol. The `times` directive will pad that number of bytes up to `510th` byte with zeros. In the end we "hard-code" the last two magic bytes `0xAA` and `0x55`. To exit from the virtual machine, you can press - `Ctrl+a x`. -```python ->>> hex((0xffff * 16) + 0xffff) -'0x10ffef' -``` +From this point onwards, the BIOS hands control over to the bootloader. -where `0x10ffef` is equal to `(1MB + 64KB - 16B) - 1`. An [8086](https://en.wikipedia.org/wiki/Intel_8086) processor (which was the first processor with real mode), in contrast, has a 20-bit address line. Since `2^20 = 1048576` is 1MB and `2^20 - 1` is the maximum address that could be used, this means that the actual available memory is 1MB. +## The Bootloader Stage -In general, real mode's memory map is as follows: +There are a number of different bootloaders that can boot Linux kernel, such as [GRUB 2](https://www.gnu.org/software/grub/), [syslinux](http://www.syslinux.org/wiki/index.php/The_Syslinux_Project), [systemd-boot](https://www.freedesktop.org/wiki/Software/systemd/systemd-boot/), and others. The Linux kernel has a [Boot protocol](https://github.com/torvalds/linux/blob/master/Documentation/arch/x86/boot.rst) which specifies the requirements for a bootloader to implement Linux support. In this chapter, we will take a short look how GRUB 2 does loading. -``` -0x00000000 - 0x000003FF - Real Mode Interrupt Vector Table -0x00000400 - 0x000004FF - BIOS Data Area -0x00000500 - 0x00007BFF - Unused -0x00007C00 - 0x00007DFF - Our Bootloader -0x00007E00 - 0x0009FFFF - Unused -0x000A0000 - 0x000BFFFF - Video RAM (VRAM) Memory -0x000B0000 - 0x000B7777 - Monochrome Video Memory -0x000B8000 - 0x000BFFFF - Color Video Memory -0x000C0000 - 0x000C7FFF - Video ROM BIOS -0x000C8000 - 0x000EFFFF - BIOS Shadow Area -0x000F0000 - 0x000FFFFF - System BIOS -``` - -At the beginning of this post, I wrote that the first instruction executed by the CPU is located at address `0xFFFFFFF0`, which is much larger than `0xFFFFF` (1MB). How can the CPU access this address in real mode? The answer is in the [coreboot](https://www.coreboot.org/Developer_Manual/Memory_map) documentation: - -``` -0xFFFE_0000 - 0xFFFF_FFFF: 128 kilobyte ROM mapped into address space -``` +Continuing from where we left off - the BIOS has now selected a boot device, found its boot sector, loaded it into memory and passed control to the code located there. GRUB 2 bootloader consists of multiple [stages](https://www.gnu.org/software/grub/manual/grub/grub.html#Images). The first stage of the boot code is in the [boot.S](https://github.com/rhboot/grub2/blob/master/grub-core/boot/i386/pc/boot.S) source code file. Due to limited amount of space for the first boot sector, this code has only single goal - to load [core image](https://www.gnu.org/software/grub/manual/grub/html_node/Images.html) into memory and jump to it. -At the start of execution, the BIOS is not in RAM, but in ROM. +The core image starts with [diskboot.S](https://github.com/rhboot/grub2/blob/master/grub-core/boot/i386/pc/diskboot.S), which is usually stored right after the first sector of the disk. The code from the `diskboot.S` file loads the rest of the core image into memory. The core image contains the code of the loader itself and drivers for reading different filesystems. After the whole core image is loaded into memory, the execution continues from the [grub_main](https://github.com/rhboot/grub2/blob/master/grub-core/kern/main.c) function. This is where GRUB sets up the environment it needs to operate: -Bootloader --------------------------------------------------------------------------------- +- Initializes the console so messages and menus can be displayed. +- Sets the root device — the disk from which GRUB will read files modules and configuration files. +- Loads and parses the GRUB configuration file. +- Loads required modules. -There are a number of bootloaders that can boot Linux, such as [GRUB 2](https://www.gnu.org/software/grub/) and [syslinux](http://www.syslinux.org/wiki/index.php/The_Syslinux_Project). The Linux kernel has a [Boot protocol](https://github.com/torvalds/linux/blob/v4.16/Documentation/x86/boot.txt) which specifies the requirements for a bootloader to implement Linux support. This example will describe GRUB 2. +Once these tasks are complete, we may see the familiar GRUB menu where we can choose the operating system we want to load. When we select one of the menu entries, GRUB executes the [boot](https://www.gnu.org/software/grub/manual/grub/grub.html#boot) command which boots the selected operating system. So how the loader loads the Linux kernel? To answer on this question, we need to get back to the Linux kernel boot protocol. -Continuing from before, now that the BIOS has chosen a boot device and transferred control to the boot sector code, execution starts from [boot.img](http://git.savannah.gnu.org/gitweb/?p=grub.git;a=blob;f=grub-core/boot/i386/pc/boot.S;hb=HEAD). Its code is very simple, due to the limited amount of space available. It contains a pointer which is used to jump to the location of GRUB 2's core image. The core image begins with [diskboot.img](http://git.savannah.gnu.org/gitweb/?p=grub.git;a=blob;f=grub-core/boot/i386/pc/diskboot.S;hb=HEAD), which is usually stored immediately after the first sector in the unused space before the first partition. The above code loads the rest of the core image, which contains GRUB 2's kernel and drivers for handling filesystems, into memory. After loading the rest of the core image, it executes the [grub_main](http://git.savannah.gnu.org/gitweb/?p=grub.git;a=blob;f=grub-core/kern/main.c) function. +As we can read in the [documentation](https://github.com/torvalds/linux/blob/master/Documentation/arch/x86/boot.rst), the bootloader must load the kernel into memory, fill some fields in the kernel setup header and pass control to the kernel code. The very first part of the kernel code is so-called kernel setup header and setup code. The kernel setup header is a special structure embedded in the early Linux boot code and provides fields that describes how kernel should be loaded and started. The setup header is started at the `0x01F1` offset from the beginning of the kernel image. We may look at the boot [linker script](https://github.com/torvalds/linux/blob/master/arch/x86/boot/setup.ld) to confirm the value of this offset: -The `grub_main` function initializes the console, gets the base address for modules, sets the root device, loads/parses the grub configuration file, loads modules, etc. At the end of execution, the `grub_main` function moves grub to normal mode. The `grub_normal_execute` function (from the `grub-core/normal/main.c` source code file) completes the final preparations and shows a menu to select an operating system. When we select one of the grub menu entries, the `grub_menu_execute_entry` function runs, executing the grub `boot` command and booting the selected operating system. + +```linker-script + . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!"); +``` -As we can read in the kernel boot protocol, the bootloader must read and fill some fields of the kernel setup header, which starts at offset `0x01f1` from the kernel setup code. You may look at the boot [linker script](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/setup.ld) to confirm the value of this offset. The kernel header [arch/x86/boot/header.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/header.S) starts from: +The kernel [setup header](https://github.com/torvalds/linux/blob/master/arch/x86/boot/header.S) is split on two parts and the first part starts from the following fields: + ```assembly - .globl hdr + .globl hdr hdr: - setup_sects: .byte 0 - root_flags: .word ROOT_RDONLY - syssize: .long 0 - ram_size: .word 0 - vid_mode: .word SVGA_MODE - root_dev: .word 0 - boot_flag: .word 0xAA55 + .byte setup_sects - 1 +root_flags: .word ROOT_RDONLY +syssize: .long ZO__edata / 16 +ram_size: .word 0 /* Obsolete */ +vid_mode: .word SVGA_MODE +root_dev: .word 0 /* Default to major/minor 0/0 */ +boot_flag: .word 0xAA55 ``` -The bootloader must fill this and the rest of the headers (which are only marked as being type `write` in the Linux boot protocol, such as in [this example](https://github.com/torvalds/linux/blob/v4.16/Documentation/x86/boot.txt#L354)) with values either received from the command line or calculated during booting. (We will not go over full descriptions and explanations for all fields of the kernel setup header for now, but we shall do so when discussing how the kernel uses them. You can find a description of all fields in the [boot protocol](https://github.com/torvalds/linux/blob/v4.16/Documentation/x86/boot.txt#L156).) +The bootloader may fill some of these fields in the setup header which marked as being type `write` or `modify` in the Linux boot protocol. The values set by the bootloader will be taken from its configuration or will be calculated during boot. Of course we will not go over full descriptions and explanations of all the fields of the kernel setup header. Instead, we will take a look closer at this or that field if we will meet it during our research of the kernel code. -As we can see in the kernel boot protocol, memory will be mapped as follows after loading the kernel: +According to the Linux kernel boot protocol, memory will be mapped as follows after loading the kernel: -```shell - | Protected-mode kernel | -100000 +------------------------+ - | I/O memory hole | -0A0000 +------------------------+ - | Reserved for BIOS | Leave as much as possible unused - ~ ~ - | Command line | (Can also be below the X+10000 mark) -X+10000 +------------------------+ - | Stack/heap | For use by the kernel real-mode code. -X+08000 +------------------------+ - | Kernel setup | The kernel real-mode code. - | Kernel boot sector | The kernel legacy boot sector. - X +------------------------+ - | Boot loader | <- Boot sector entry point 0x7C00 -001000 +------------------------+ - | Reserved for MBR/BIOS | -000800 +------------------------+ - | Typically used by MBR | -000600 +------------------------+ - | BIOS use only | -000000 +------------------------+ +``` + ~ ~ + | Protected-mode kernel | +100000 +------------------------+ + | I/O memory hole | +0A0000 +------------------------+ + | Reserved for BIOS | Leave as much as possible unused + ~ ~ + | Command line | (Can also be below the X+10000 mark) +X+10000 +------------------------+ + | Stack/heap | For use by the kernel real-mode code. +X+08000 +------------------------+ + | Kernel setup | The kernel real-mode code. + | Kernel boot sector | The kernel legacy boot sector. +X +------------------------+ + | Boot loader | <- Boot sector entry point 0000:7C00 +001000 +------------------------+ + | Reserved for MBR/BIOS | +000800 +------------------------+ + | Typically used by MBR | +000600 +------------------------+ + | BIOS use only | +000000 +------------------------+ +... where the address X is as low as the design of the boot loader permits. ``` -When the bootloader transfers control to the kernel, it starts at: +We can see that when the bootloader transfers control to the kernel, execution starts right after the kernel’s boot sector — that is, at the address `X` plus the length of the boot sector. The value of this `X` depends on how the kernel loaded. For example if I try to load kernel just with [qemu](https://www.qemu.org/), the starting address of the kernel image is at `0x10000`: -``` -X + sizeof(KernelBootSector) + 1 +```bash +hexdump -C /tmp/dump | grep MZ +00010000 4d 5a 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |MZ..............| ``` -where `X` is the address of the kernel boot sector being loaded. In my case, `X` is `0x10000`, as we can see in a memory dump: +Linux kernel image starts from `4D 5A` bytes as you may see in the beginning of the kernel setup code: -![kernel first address](images/kernel_first_address.png) - -How to get this memory dump in real mode? --------------------------------------------------------------------------------- -``` -root@parallels-vm:/usr/src/linux# more arch/x86/kernel/vmlinux.lds -... -SECTIONS -{ - . = (0xffffffff80000000 + ALIGN(0x1000000, 0x200000)); - phys_startup_64 = ABSOLUTE(startup_64 - 0xffffffff80000000); - .text : AT(ADDR(.text) - 0xffffffff80000000) { - _text = .; - _stext = .; -.... + +```assembly + .code16 + .section ".bstext", "ax" +#ifdef CONFIG_EFI_STUB + # "MZ", MS-DOS header + .word IMAGE_DOS_SIGNATURE ``` -``` -root@parallels-vm:/usr/src/linux# nm vmlinux|grep startup_64 -0000000001000000 A phys_startup_64 -ffffffff81000030 T secondary_startup_64 -ffffffff810001f0 T __startup_64 -ffffffff81000000 T startup_64 -``` +If you want to get a similar memory dump, follow these steps. First of all, you need to build kernel. If you do not know how to do it, you can find detailed instruction [here](https://github.com/0xAX/linux-insides/blob/master/Misc/linux-misc-1.md). On the diagram above, we can see that the `Protected-mode` kernel starts from `0x100000`. Knowing this address we can start the kernel in the qemu virtual machine with the following command: -Here we can see the memory address of the entry point, which is `0x0000000001000000`. Let's go ahead. +```bash +sudo qemu-system-x86_64 -kernel ./linux/arch/x86/boot/bzImage \ + -nographic \ + -append "console=ttyS0 nokaslr" \ + -initrd /boot/initramfs-6.17.0-rc1-g8f5ae30d69d7.img -s -S +``` -Before trying to debug the kernel, please see [Booting a Custom Linux Kernel in QEMU and Debugging It With GDB](http://nickdesaulniers.github.io/blog/2018/10/24/booting-a-custom-linux-kernel-in-qemu-and-debugging-it-with-gdb/) +After the virtual machine is started, we can attach the debugger to it, set up a breakpoint on the entry point and get the dump: -#### Step 1 -Booting in QEMU -``` -qemu-system-x86_64 -kernel /usr/src/linux-4.14.207/arch/x86_64/boot/bzImage -nographic -append "console=ttyS0 nokaslr" -initrd /data/busybox/busybox-1.28.0/initramfs.cpio.gz -S -s -``` -#### Step 2 -Attaching GDB to QEMU -``` +```bash gdb vmlinux (gdb) target remote :1234 -(gdb) hbreak *0x0000000001000000 +(gdb) hbreak *0x100000 (gdb) c +Continuing. + +Breakpoint 1, 0x0000000000100000 in ?? () (gdb) dump binary memory /tmp/dump 0x0000 0x20000 ``` -#### Step 3 -``` -root@parallels-vm:/# hd /tmp/dump |grep -A 31 MZ -00010000 4d 5a ea 07 00 c0 07 8c c8 8e d8 8e c0 8e d0 31 |MZ.............1| -00010010 e4 fb fc be 40 00 ac 20 c0 74 09 b4 0e bb 07 00 |....@.. .t......| -00010020 cd 10 eb f2 31 c0 cd 16 cd 19 ea f0 ff 00 f0 00 |....1...........| -00010030 00 00 00 00 00 00 00 00 00 00 00 00 82 00 00 00 |................| -00010040 55 73 65 20 61 20 62 6f 6f 74 20 6c 6f 61 64 65 |Use a boot loade| -00010050 72 2e 0d 0a 0a 52 65 6d 6f 76 65 20 64 69 73 6b |r....Remove disk| -00010060 20 61 6e 64 20 70 72 65 73 73 20 61 6e 79 20 6b | and press any k| -00010070 65 79 20 74 6f 20 72 65 62 6f 6f 74 2e 2e 2e 0d |ey to reboot....| -00010080 0a 00 50 45 00 00 64 86 04 00 00 00 00 00 00 00 |..PE..d.........| -00010090 00 00 01 00 00 00 a0 00 06 02 0b 02 02 14 20 d5 |.............. .| -000100a0 80 00 00 00 00 00 e0 b8 79 01 80 46 00 00 00 02 |........y..F....| -000100b0 00 00 00 00 00 00 00 00 00 00 20 00 00 00 20 00 |.......... ... .| -000100c0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| -000100d0 00 00 00 90 fa 01 00 02 00 00 00 00 00 00 0a 00 |................| -000100e0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| -* -00010100 00 00 00 00 00 00 06 00 00 00 00 00 00 00 00 00 |................| -00010110 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| -* -00010130 00 00 00 00 00 00 00 00 00 00 2e 73 65 74 75 70 |...........setup| -00010140 00 00 e0 41 00 00 00 02 00 00 e0 41 00 00 00 02 |...A.......A....| -00010150 00 00 00 00 00 00 00 00 00 00 00 00 00 00 20 00 |.............. .| -00010160 50 60 2e 72 65 6c 6f 63 00 00 20 00 00 00 e0 43 |P`.reloc.. ....C| -00010170 00 00 20 00 00 00 e0 43 00 00 00 00 00 00 00 00 |.. ....C........| -00010180 00 00 00 00 00 00 40 00 10 42 2e 74 65 78 74 00 |......@..B.text.| -00010190 00 00 20 93 80 00 00 44 00 00 20 93 80 00 00 44 |.. ....D.. ....D| -000101a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 20 00 |.............. .| -000101b0 50 60 2e 62 73 73 00 00 00 00 e0 b8 79 01 20 d7 |P`.bss......y. .| -000101c0 80 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| -000101d0 00 00 00 00 00 00 80 00 00 c8 00 00 00 00 00 00 |................| -000101e0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ff |................| -000101f0 ff 21 01 00 32 09 08 00 00 00 ff ff 00 00 55 aa |.!..2.........U.| -``` -The bootloader has now loaded the Linux kernel into memory, filled the header fields, and then jumped to the corresponding memory address. We now move directly to the kernel setup code. +After this you should be able to find your dump in the `/tmp/dump`. + +If we try to load Linux kernel using GRUB 2 bootloader, this `X` address will be `0x90000`. Let's take a look how to do it and check. First of all you need to prepare image with kernel and GRUB 2. To do so execute the following commands: + +```bash +qemu-img create hdd.img 64M +parted hdd.img --script mklabel msdos +parted hdd.img --script mkpart primary ext2 1MiB 100% +parted hdd.img --script set 1 boot on +sudo losetup -fP hdd.img +sudo mkfs.ext2 /dev/loop0p1 +sudo mount /dev/loop0p1 /mnt/tmp +sudo mkdir -p /mnt/tmp/boot/grub +sudo grub2-install \ + --target=i386-pc \ + --boot-directory=/mnt/tmp/boot \ + /dev/loop0 +sudo cp ./arch/x86/boot/bzImage /mnt/tmp/boot/ +sudo tee /mnt/tmp/boot/grub/grub.cfg > /dev/null < ```assembly - .globl _start _start: - .byte 0xeb - .byte start_of_setup-1f + # Explicitly enter this as bytes, or the assembler + # tries to generate a 3-byte jump here, which causes + # everything else to push off to the wrong offset. + .byte 0xeb # short (2-byte) jump + .byte start_of_setup-1f 1: - // - // rest of the header - // ``` -Here we can see a `jmp` instruction opcode (`0xeb`) that jumps to the `start_of_setup-1f` point. In `Nf` notation, `2f`, for example, refers to the local label `2:`. In our case, it's label `1:` that is present right after the jump, and contains the rest of the setup [header](https://github.com/torvalds/linux/blob/v4.16/Documentation/x86/boot.txt#L156). Right after the setup header, we see the `.entrytext` section, which starts at the `start_of_setup` label. - -This is the first code that actually runs (aside from the previous jump instructions, of course). After the kernel setup part receives control from the bootloader, the first `jmp` instruction is located at the `0x200` offset from the start of the kernel real mode, i.e., after the first 512 bytes. This can be seen in both the Linux kernel boot protocol and the GRUB 2 source code: +The very first instruction we encounter here is the jump specified by the `0xEB` opcode. The second byte is the distance where to jump. If you’ve never met the `Nf` syntax before, `1f` means the next label `1` that will appear in the code. And immediately after those two bytes is the label `1` which is located right before the beginning of the second part of the kernel setup header. Right after the second part of the setup header, we see the `.entrytext` section, which starts at the `start_of_setup` label. This is exactly the place where the execution will be continued. But from where we are jumping? After the kernel setup code receives control from the bootloader, the first `jmp` instruction is located at the `0x200` bytes offset from the start of the loaded kernel image. This can be seen in both the Linux kernel boot protocol and the GRUB 2 [source code](https://github.com/rhboot/grub2/blob/master/grub-core/loader/i386/pc/linux.c): ```C segment = grub_linux_real_target >> 4; state.gs = state.fs = state.es = state.ds = state.ss = segment; state.cs = segment + 0x20; +state.ip = 0; ``` -In my case, the kernel is loaded at the physical address `0x10000`. This means that segment registers have the following values after kernel setup starts: +Here, `grub_linux_real_target` is the physical load address of the setup code. As we have seen in the previous section, this address is usually `0x90000`. Shifting it right by four divides it by `16`, converting a physical address into a segment value - that’s how real mode memory segmentation works. Then GRUB adds `0x20` to `cs` before starting execution. Why `0x20`? Let's remember that in real mode, physical addresses are computed as: ``` -gs = fs = es = ds = ss = 0x1000 -cs = 0x1020 +Physical = (cs << 4) + ip ``` -After the jump to `start_of_setup`, the kernel needs to do the following: +With `ip = 0` and `cs` increased by `0x20`, the offset from the start of the loaded image is: -* Make sure that all segment register values are equal -* Set up a correct stack, if needed -* Set up [bss](https://en.wikipedia.org/wiki/.bss) -* Jump to the C code in [arch/x86/boot/main.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/main.c) +``` +0x20 << 4 = 0x200 +``` -Let's look at the implementation. +This is 512 bytes — exactly the offset where our jump instruction resides in the image. -Aligning the Segment Registers --------------------------------------------------------------------------------- +After the jump to the `start_of_setup` label, the kernel setup code enters the very first phase of its real work: -First of all, the kernel ensures that the `ds` and `es` segment registers point to the same address. Next, it clears the direction flag using the `cld` instruction: +- Unifying the segment registers +- Establishing a valid stack +- Clearing the `.bss` section +- Transitioning into C code -```assembly - movw %ds, %ax - movw %ax, %es - cld -``` +In the next sections, we’ll walk through each of these steps in detail. -As I wrote earlier, `grub2` loads kernel setup code at address `0x10000` by default and `cs` at `0x1020` because execution doesn't start from the start of the file, but from the jump here: +### Aligning the segment registers -```assembly -_start: - .byte 0xeb - .byte start_of_setup-1f -``` - -which is at a `512` byte offset from [4d 5a](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/header.S#L46). We also need to align `cs` from `0x1020` to `0x1000`, as well as all other segment registers. After that, we set up the stack: +First of all, the kernel setup code ensures that the `ds` and `es` segment registers point to the same address. Next, it clears the [direction flag](https://en.wikipedia.org/wiki/Direction_flag) using the `cld` instruction: + ```assembly - pushw %ds - pushw $6f - lretw + .section ".entrytext", "ax" +start_of_setup: +# Force %es = %ds + movw %ds, %ax + movw %ax, %es + cld ``` -which pushes the value of `ds` to the stack, followed by the address of the [6](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/header.S#L602) label and executes the `lretw` instruction. When the `lretw` instruction is called, it loads the address of label `6` into the [instruction pointer](https://en.wikipedia.org/wiki/Program_counter) register and loads `cs` with the value of `ds`. Afterward, `ds` and `cs` will have the same values. +We need to do both of these two things to clear the [bss](https://en.wikipedia.org/wiki/.bss) section properly a bit later. From this point we are sure that both `ds` and `es` segment registers point to the same address - `0x9000`. -Stack Setup --------------------------------------------------------------------------------- +### Stack Setup -Almost all of the setup code is for preparing the C language environment in real mode. The next [step](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/header.S#L575) is checking the `ss` register's value and setting up a correct stack if `ss` is wrong: +We need to prepare for C language environment. The next step is to setup the stack. Let's take a look at the next lines of the code: + ```assembly - movw %ss, %dx - cmpw %ax, %dx - movw %sp, %dx - je 2f + movw %ss, %dx + cmpw %ax, %dx # %ds == %ss? + movw %sp, %dx + je 2f # -> assume %sp is reasonably set ``` -This can lead to 3 different scenarios: - -* `ss` has a valid value `0x1000` (as do all the other segment registers besides `cs`) -* `ss` is invalid and the `CAN_USE_HEAP` flag is set (see below) -* `ss` is invalid and the `CAN_USE_HEAP` flag is not set (see below) - -Let's look at all three of these scenarios in turn: - -* `ss` has a correct address (`0x1000`). In this case, we go to label [2](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/header.S#L589): +Here we compare the value of the `ss` and `ds` registers. According to the comment around this code, only old versions of the [LILO](https://en.wikipedia.org/wiki/LILO_(bootloader)) bootloader may set these registers to different values. So we will skip all the "edge cases" and consider only single case when the value of the `ss` register equal to `ds`. Since the values of these registers are equal, we jump to the `2` label: + ```assembly -2: andw $~3, %dx - jnz 3f - movw $0xfffc, %dx -3: movw %ax, %ss - movzwl %dx, %esp - sti +2: # Now %dx should point to the end of our stack space + andw $~3, %dx # dword align (might as well...) + jnz 3f + movw $0xfffc, %dx # Make sure we're not zero +3: movw %ax, %ss + movzwl %dx, %esp # Clear upper half of %esp + sti # Now we should have a working stack ``` -Here we set the alignment of `dx` (which contains the value of `sp` as given by the bootloader) to `4` bytes and check if it is zero. If it is, we set `dx` to `0xfffc` (The last 4-byte aligned address in a 64KB segment). If it is not zero, we continue to use the value of `sp` given by the bootloader (`0xf7f4` in my case). Afterwards, we put the value of `ax` (`0x1000`) into `ss`. We now have a correct stack: - -![stack](images/stack1.png) - -* The second scenario, (`ss` != `ds`). First, we put the value of [_end](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/setup.ld) (the address of the end of the setup code) into `dx` and check the `loadflags` header field using the `testb` instruction to see whether we can use the heap. [loadflags](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/header.S#L320) is a bitmask header defined as: +`dx` register stores stack pointer value whish should point to the top of the stack. The value of the stack pointer is `0x9000`. GRUB 2 bootloader sets it during loading of the Linux kernel image and the address is defined by the: + ```C -#define LOADED_HIGH (1<<0) -#define QUIET_FLAG (1<<5) -#define KEEP_SEGMENTS (1<<6) -#define CAN_USE_HEAP (1<<7) +#define GRUB_LINUX_SETUP_STACK 0x9000 ``` -and as we can read in the boot protocol: +At the next step we check that the address is aligned by four bytes and if yes jump to the label `3`. If the stack pointer is not aligned, we set it to `0xFFFC` value. The reason for this that we can not have stack pointer equal to zero as it grows down during pushing something on the stack. The `0xFFFC` value is the highest 4‑byte aligned address below `0x10000`. If the value of the stack pointer is aligned, we continue to use the aligned value. -``` -Field name: loadflags +From this point we have a correct stack and starts from `0x9000:0x9000` and grows down: - This field is a bitmask. +![early-stack](./images/early-stack.svg) - Bit 7 (write): CAN_USE_HEAP - Set this bit to 1 to indicate that the value entered in the - heap_end_ptr is valid. If this field is clear, some setup code - functionality will be disabled. -``` +### BSS Setup -If the `CAN_USE_HEAP` bit is set, we put `heap_end_ptr` into `dx` (which points to `_end`) and add `STACK_SIZE` (the minimum stack size, `1024` bytes) to it. After this, if `dx` is not carried (it will not be carried, `dx = _end + 1024`), jump to label `2` (as in the previous case) and make a correct stack. +Before the kernel can switch to C code, two final tasks must be done: -![stack](images/stack2.png) +- Verify the “magic” signature. +- Clear the `.bss` section. -* When `CAN_USE_HEAP` is not set, we just use a minimal stack from `_end` to `_end + STACK_SIZE`: +The first is the signature checking: -![minimal stack](images/minimal_stack.png) + +```assembly + cmpl $0x5a5aaa55, setup_sig + jne setup_bad +``` -BSS Setup --------------------------------------------------------------------------------- +This simply compares the [setup_sig](https://github.com/torvalds/linux/blob/master/arch/x86/boot/setup.ld) constant value placed by the linker with the magic number `0x5A5AAA55`. If they are not equal, the setup code reports a fatal error and stops execution. The main goal of this check is to ensure we are actually running a valid Linux kernel setup binary, loaded into the proper place by the bootloader. -The last two steps that need to happen before we can jump to the main C code are setting up the [BSS](https://en.wikipedia.org/wiki/.bss) area and checking the "magic" signature. First, signature checking: +With the magic number confirmed, and knowing our segment registers and stack are already in the proper state, the only initialization left is to clear the `.bss` section. The section of memory is used to store statically allocated, uninitialized data. Let's take a look at the initialization of this memory area: + ```assembly - cmpl $0x5a5aaa55, setup_sig - jne setup_bad + movw $__bss_start, %di + movw $_end+3, %cx + xorl %eax, %eax + subw %di, %cx + shrw $2, %cx + rep stosl ``` -This simply compares the [setup_sig](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/setup.ld) with the magic number `0x5a5aaa55`. If they are not equal, a fatal error is reported. +The main goal of this code is to clear or in other words to fill with zeros the memory area between `__bss_start` and `_end`. To fill this memory area with zeros, the `rep stos` instruction is used. This instruction puts the value of the `eax` register to the destination pointed by the `es:di`. That is why we unified the values of the `ds` and `es` registers. The `rep` prefix specifies the repetition of the `stos` instruction based on the value of the `cx` register. -If the magic number matches, knowing we have a set of correct segment registers and a stack, we only need to set up the BSS section before jumping into the C code. +To clear this memory area, at first we set the borders of this area - from the [__bss_start](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/setup.ld) to `_end + 3`. We add `3` bytes to the `_end` address because we are going to write zeros in double words or 4 bytes at a time). Adding three bytes ensures that when we later divide by four, any reminder at the end of the memory area still get covered. After we setup the borders of the memory area and fill the `eax` with 0 using the `xor` instruction, the `rep stosl` does its job. -The BSS section is used to store statically allocated, uninitialized data. Linux carefully ensures this area of memory is first zeroed using the following code: +The effect of this code is that zeros are written through the all memory from `__bss_start` to `_end`. To know exact addresses of them we can inspect `setup.elf` file with [readelf](https://en.wikipedia.org/wiki/Readelf) utility: -```assembly - movw $__bss_start, %di - movw $_end+3, %cx - xorl %eax, %eax - subw %di, %cx - shrw $2, %cx - rep; stosl +```bash +$ readelf -a arch/x86/boot/setup.elf | grep bss + [12] .bss NOBITS 00003f00 004efc 001380 00 WA 0 0 32 + 00 .bstext .header .entrytext .inittext .initdata .text .text32 .rodata .videocards .data .signature .bss + 145: 00005280 0 NOTYPE GLOBAL DEFAULT 12 __bss_end + 169: 00003f00 0 NOTYPE GLOBAL DEFAULT 12 __bss_start ``` -First, the [__bss_start](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/setup.ld) address is moved into `di`. Next, the `_end + 3` address (+3 - aligns to 4 bytes) is moved into `cx`. The `eax` register is cleared (using the `xor` instruction), and the bss section size (`cx - di`) is calculated and put into `cx`. Then, `cx` is divided by four (the size of a 'word'), and the `stosl` instruction is used repeatedly, storing the value of `eax` (zero) into the address pointed to by `di`, automatically increasing `di` by four, repeating until `cx` reaches zero. The net effect of this code is that zeros are written through all words in memory from `__bss_start` to `_end`: +These offsets inside the setup segment. Since in our case the kernel image is loaded at physical address `0x90000`, the symbols translate to: + +- __bss_start = 0x90000 + 0x3f00 = 0x93F00 +- __bss_end = 0x90000 + 0x5280 = 0x95280 -![bss](images/bss.png) +The following diagram illustrates how the setup image, `.bss`, and the stack region are laid out in memory: -Jump to main --------------------------------------------------------------------------------- +![bss](./images/early-bss.svg) -That's all! We have the stack and BSS, so we can jump to the `main()` C function: +### Jump to C code +At this point we have initialized the [stack](#stack-setup) and [.bss](#bss-setup) sections. The last instruction of the early kernel setup assembly is to jump to C code: + + ```assembly - calll main + calll main ``` -The `main()` function is located in [arch/x86/boot/main.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/main.c). You can read about what this does in the next part. +The `main()` function is located in [arch/x86/boot/main.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/main.c) source code file. + +What's happening there, we will see in the next chapter. -Conclusion --------------------------------------------------------------------------------- +## Conclusion -This is the end of the first part about Linux kernel insides. If you have questions or suggestions, ping me on Twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com), or just create an [issue](https://github.com/0xAX/linux-internals/issues/new). In the next part, we will see the first C code that executes in the Linux kernel setup, the implementation of memory routines such as `memset`, `memcpy`, `earlyprintk`, early console implementation and initialization, and much more. +This is the end of the first part about Linux kernel insides. If you have questions or suggestions, feel free ping me on X - [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com), or just create an [issue](https://github.com/0xAX/linux-insides/issues/new). In the next part, we will see the first C code that executes in the Linux kernel setup, the implementation of memory routines such as `memset`, `memcpy`, `earlyprintk`, early console implementation and initialization, and much more. -**Please note that English is not my first language and I am really sorry for any inconvenience. If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-internals).** +## Links -Links --------------------------------------------------------------------------------- +Here is the list of the links that you may find useful during reading of this chapter: - * [Intel 80386 programmer's reference manual 1986](http://css.csail.mit.edu/6.858/2014/readings/i386.pdf) - * [Minimal Boot Loader for Intel® Architecture](https://www.cs.cmu.edu/~410/doc/minimal_boot.pdf) - * [Minimal Boot Loader in Assembler with comments](https://github.com/Stefan20162016/linux-insides-code/blob/master/bootloader.asm) - * [8086](https://en.wikipedia.org/wiki/Intel_8086) - * [80386](https://en.wikipedia.org/wiki/Intel_80386) - * [Reset vector](https://en.wikipedia.org/wiki/Reset_vector) - * [Real mode](https://en.wikipedia.org/wiki/Real_mode) - * [Linux kernel boot protocol](https://www.kernel.org/doc/Documentation/x86/boot.txt) - * [coreboot developer manual](https://www.coreboot.org/Developer_Manual) - * [Ralf Brown's Interrupt List](http://www.ctyme.com/intr/int.htm) - * [Power supply](https://en.wikipedia.org/wiki/Power_supply) - * [Power good signal](https://en.wikipedia.org/wiki/Power_good_signal) +- [Intel 80386 programmer's reference manual 1986](http://css.csail.mit.edu/6.858/2014/readings/i386.pdf) +- [Minimal Boot Loader for Intel® Architecture](https://www.cs.cmu.edu/~410/doc/minimal_boot.pdf) +- [Minimal Boot Loader in Assembler with comments](https://github.com/Stefan20162016/linux-insides-code/blob/master/bootloader.asm) +- [8086](https://en.wikipedia.org/wiki/Intel_8086) +- [80386](https://en.wikipedia.org/wiki/Intel_80386) +- [Reset vector](https://en.wikipedia.org/wiki/Reset_vector) +- [Real mode](https://en.wikipedia.org/wiki/Real_mode) +- [Linux kernel boot protocol](https://www.kernel.org/doc/Documentation/x86/boot.rst) +- [Ralf Brown's Interrupt List](http://www.ctyme.com/intr/int.htm) +- [Power supply](https://en.wikipedia.org/wiki/Power_supply) +- [Power good signal](https://en.wikipedia.org/wiki/Power_good_signal) diff --git a/Booting/linux-bootstrap-2.md b/Booting/linux-bootstrap-2.md index 508d6d33..c3454d44 100644 --- a/Booting/linux-bootstrap-2.md +++ b/Booting/linux-bootstrap-2.md @@ -1,418 +1,363 @@ -Kernel booting process. Part 2. -================================================================================ +# Kernel booting process - Part 2. -First steps in the kernel setup --------------------------------------------------------------------------------- +We have already started our journey into the Linux kernel in the previous [part](./linux-bootstrap-1.md), where we have walked through the very early stages of the booting process and first assembly instructions of the Linux kernel code. Aside from different mechanisms, this code was responsible to prepare environment for [C](https://en.wikipedia.org/wiki/C_(programming_language)) programming language. At the end of chapter we reached a symbolic milestone - the very first call of a C function. This function has classical name - `main` and defined in the [arch/x86/boot/main.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/main.c) source code file. -We started to dive into the linux kernel's insides in the previous [part](linux-bootstrap-1.md) and saw the initial part of the kernel setup code. We stopped at the first call to the `main` function (which is the first function written in C) from [arch/x86/boot/main.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/main.c). +From here on, we will start to see assembler code more and more rare, but it is not the end 🤓 We still will meet some assembly code on our way, but it will be more rare and rare. But now it is time for more "high level" logic! -In this part, we will continue to research the kernel setup code and go over -* what `protected mode` is, -* the transition into it, -* the initialization of the heap and the console, -* memory detection, CPU validation and keyboard initialization -* and much much more. +In this part, we’ll keep digging through the kernel’s setup code and cover: -So, let's go ahead. +- What [protected mode](https://en.wikipedia.org/wiki/Protected_mode) is on x86 processors +- Setup of early [heap](https://en.wikipedia.org/wiki/Memory_management#HEAP) and console +- Detection of available memory +- Validation of a CPU +- Initialization of a keyboard -Protected mode --------------------------------------------------------------------------------- +Time to explore these steps in detail! -Before we can move to the native Intel64 [Long Mode](http://en.wikipedia.org/wiki/Long_mode), the kernel must switch the CPU into protected mode. +## Protected mode -What is [protected mode](https://en.wikipedia.org/wiki/Protected_mode)? Protected mode was first added to the x86 architecture in 1982 and was the main mode of Intel processors from the [80286](http://en.wikipedia.org/wiki/Intel_80286) processor until Intel 64 and long mode came. +The Linux kernel for x86_64 operates in a special mode called - [long mode](http://en.wikipedia.org/wiki/Long_mode). One of the main goal of all the setup kernel code is to switch to this mode. But before we can move to this mode, the kernel must switch the CPU into [protected mode](https://en.wikipedia.org/wiki/Protected_mode). -The main reason to move away from [Real mode](http://wiki.osdev.org/Real_Mode) is that there is very limited access to the RAM. As you may remember from the previous part, there are only 220 bytes or 1 Megabyte, sometimes even only 640 Kilobytes of RAM available in Real mode. +What is [protected mode](https://en.wikipedia.org/wiki/Protected_mode)? From the previous chapter we already know that currently CPU operates in [real mode](https://en.wikipedia.org/wiki/Real_mode). For us it is mostly means - memory segmentation. As a short reminder - to access a memory location, the combination of two CPU [registers](https://en.wikipedia.org/wiki/Processor_register) is used: -Protected mode brought many changes, but the main one is the difference in memory management. The 20-bit address bus was replaced with a 32-bit address bus. It allowed access to 4 Gigabytes of memory vs the 1 Megabyte in Real mode. Also, [paging](http://en.wikipedia.org/wiki/Paging) support was added, which you can read about in the next sections. +- A segment register - `cs`, `ds`, `ss` and `es` which defines segment selector. +- A general purpose register which specifies offset within the segment. -Memory management in Protected mode is divided into two, almost independent parts: +The main motivation for switching from real mode is its memory limitation. As we saw in the previous part, real mode can address only 220 bytes. This is just 1 MB of RAM. Obviously modern software including an operating system kernel need more. To break this constraints, the new processor mode was introduced - protected mode. -* Segmentation -* Paging +Protected mode was introduced to the x86 architecture in 1982 and became the primary operating mode of Intel processors, starting with the [80286](http://en.wikipedia.org/wiki/Intel_80286) until the introduction of x86_64 and long mode. This mode brought many changes and improvements, but one of the most crucial was in memory management. The 20-bit address bus was replaced with a 32-bit address bus. It allowed access to 4 Gigabytes of memory vs the 1 Megabyte in real mode. -Here we will only talk about segmentation. Paging will be discussed in the next sections. +Memory management in protected mode is divided into two, mostly independent mechanisms: -As you can read in the previous part, addresses consist of two parts in Real mode: +- `Segmentation` +- `Paging` -* Base address of the segment -* Offset from the segment base +For now, our attention stays on segmentation. We’ll return to paging later, once we enter 64-bit long mode. -And we can get the physical address if we know these two parts by: +### Memory segmentation in protected mode -``` -PhysicalAddress = Segment Base * 16 + Offset -``` +In protected mode, memory segmentation was completely redesigned. Fixed 64 KB real mode segments are gone. Instead, each segment is now defined by a special data structure called a `Segment Descriptor` which specifies the properties of a memory segment. The segment descriptors are stored in another special structure called `Global Descriptor Table` or `GDT`. Whenever a CPU needs to find an actual physical memory address, it consults this table. The GDT itself is just a block of memory which address is stored in the special CPU register called `gdtr`. This is a 48-bit register and consists of two parts: -Memory segmentation was completely redone in protected mode. There are no 64 Kilobyte fixed-size segments. Instead, the size and location of each segment is described by an associated data structure called the _Segment Descriptor_. These segment descriptors are stored in a data structure called the `Global Descriptor Table` (GDT). +- The size of the Global Descriptor Table +- The address of the Global Descriptor Table -The GDT is a structure which resides in memory. It has no fixed place in the memory, so its address is stored in the special `GDTR` register. Later we will see how the GDT is loaded in the Linux kernel code. There will be an operation for loading it from memory, something like: +Later, we will see exactly how the Linux kernel builds and loads its GDT. For now, it’s enough to know that the CPU provides a dedicated instruction to load the table’s address into the GDTR register: ```assembly lgdt gdt ``` -where the `lgdt` instruction loads the base address and limit(size) of the global descriptor table to the `GDTR` register. `GDTR` is a 48-bit register and consists of two parts: - - * the size(16-bit) of the global descriptor table; - * the address(32-bit) of the global descriptor table. - -As mentioned above, the GDT contains `segment descriptors` which describe memory segments. Each descriptor is 64-bits in size. The general scheme of a descriptor is: - -``` - 63 56 51 48 45 39 32 ------------------------------------------------------------- -| | |B| |A| | | | |0|E|W|A| | -| BASE 31:24 |G|/|L|V| LIMIT |P|DPL|S| TYPE | BASE 23:16 | -| | |D| |L| 19:16 | | | |1|C|R|A| | ------------------------------------------------------------- - - 31 16 15 0 ------------------------------------------------------------- -| | | -| BASE 15:0 | LIMIT 15:0 | -| | | ------------------------------------------------------------- -``` - -Don't worry, I know it looks a little scary after Real mode, but it's easy. For example LIMIT 15:0 means that bits 0-15 of the segment limit are located at the beginning of the Descriptor. The rest of it is in LIMIT 19:16, which is located at bits 48-51 of the Descriptor. So, the size of Limit is 0-19 i.e 20-bits. Let's take a closer look at it: - -1. Limit[20-bits] is split between bits 0-15 and 48-51. It defines the `length_of_segment - 1`. It depends on the `G`(Granularity) bit. +As mentioned above, the GDT contains `segment descriptors` which describe memory segments. Now let's see how segment descriptors look like. Each descriptor is 64-bits in size. The general scheme of a descriptor is: - * if `G` (bit 55) is 0 and the segment limit is 0, the size of the segment is 1 Byte - * if `G` is 1 and the segment limit is 0, the size of the segment is 4096 Bytes - * if `G` is 0 and the segment limit is 0xfffff, the size of the segment is 1 Megabyte - * if `G` is 1 and the segment limit is 0xfffff, the size of the segment is 4 Gigabytes +![segment-descriptor](./images/segment-descriptor.svg) - So, what this means is - * if G is 0, Limit is interpreted in terms of 1 Byte and the maximum size of the segment can be 1 Megabyte. - * if G is 1, Limit is interpreted in terms of 4096 Bytes = 4 KBytes = 1 Page and the maximum size of the segment can be 4 Gigabytes. Actually, when G is 1, the value of Limit is shifted to the left by 12 bits. So, 20 bits + 12 bits = 32 bits and 232 = 4 Gigabytes. +Do not worry! I know it may look a little bit intimidating at the first glance, especially in comparison to the relatively simple addressing in real mode, but we will go through it in details. We will start from the bottom, from right to left. -2. Base[32-bits] is split between bits 16-31, 32-39 and 56-63. It defines the physical address of the segment's starting location. +The first field is `LIMIT 15:0`. It represents the first 16 bits of the segment limit. The second part is located at the bits `51:48`. This field provides information about the size of a segment. Having 20-bit size of the limit field, it may seem that the max size of a memory segment can be 1 MB, but it is not like that. In addition, the max size of a segment depends on the 55th `G` bit: -3. Type/Attribute[5-bits] is represented by bits 40-44. It defines the type of segment and how it can be accessed. - * The `S` flag at bit 44 specifies the descriptor type. If `S` is 0 then this segment is a system segment, whereas if `S` is 1 then this is a code or data segment (Stack segments are data segments which must be read/write segments). +- If `G=0` - the value of the `LIMIT` field is interpreted in bytes. +- if `G=1` - the value of the `LIMIT` field is interpreted in 4 KB units called pages. -To determine if the segment is a code or data segment, we can check its Ex(bit 43) Attribute (marked as 0 in the above diagram). If it is 0, then the segment is a Data segment, otherwise, it is a code segment. +Based on this, we can easily calculate that the max size of a segment is 4 GB. -A segment can be of one of the following types: +The next field is `BASE`. We may see that it is split on three parts. The first part occupies bits from 16 to 31, the second part occupies bits from 32 to 39, and the last third part occupies bits from 56 to 63. The main goal of this field is to store the base address of a segment. -``` --------------------------------------------------------------------------------------- -| Type Field | Descriptor Type | Description | -|-----------------------------|-----------------|------------------------------------| -| Decimal | | | -| 0 E W A | | | -| 0 0 0 0 0 | Data | Read-Only | -| 1 0 0 0 1 | Data | Read-Only, accessed | -| 2 0 0 1 0 | Data | Read/Write | -| 3 0 0 1 1 | Data | Read/Write, accessed | -| 4 0 1 0 0 | Data | Read-Only, expand-down | -| 5 0 1 0 1 | Data | Read-Only, expand-down, accessed | -| 6 0 1 1 0 | Data | Read/Write, expand-down | -| 7 0 1 1 1 | Data | Read/Write, expand-down, accessed | -| C R A | | | -| 8 1 0 0 0 | Code | Execute-Only | -| 9 1 0 0 1 | Code | Execute-Only, accessed | -| 10 1 0 1 0 | Code | Execute/Read | -| 11 1 0 1 1 | Code | Execute/Read, accessed | -| 12 1 1 0 0 | Code | Execute-Only, conforming | -| 14 1 1 0 1 | Code | Execute-Only, conforming, accessed | -| 13 1 1 1 0 | Code | Execute/Read, conforming | -| 15 1 1 1 1 | Code | Execute/Read, conforming, accessed | --------------------------------------------------------------------------------------- -``` - -As we can see the first bit(bit 43) is `0` for a _data_ segment and `1` for a _code_ segment. The next three bits (40, 41, 42) are either `EWA`(*E*xpansion *W*ritable *A*ccessible) or CRA(*C*onforming *R*eadable *A*ccessible). - * if E(bit 42) is 0, expand up, otherwise, expand down. Read more [here](http://www.sudleyplace.com/dpmione/expanddown.html). - * if W(bit 41)(for Data Segments) is 1, write access is allowed, and if it is 0, the segment is read-only. Note that read access is always allowed on data segments. - * A(bit 40) controls whether the segment can be accessed by the processor or not. - * C(bit 43) is the conforming bit(for code selectors). If C is 1, the segment code can be executed from a lower level privilege (e.g. user) level. If C is 0, it can only be executed from the same privilege level. - * R(bit 41) controls read access to code segments; when it is 1, the segment can be read from. Write access is never granted for code segments. - -4. DPL[2-bits] (Descriptor Privilege Level) comprises the bits 45-46. It defines the privilege level of the segment. It can be 0-3 where 0 is the most privileged level. - -5. The P flag(bit 47) indicates if the segment is present in memory or not. If P is 0, the segment will be presented as _invalid_ and the processor will refuse to read from this segment. +The remaining of the fields in a segment descriptor represent flags which control different aspects of a segment, like for example type of a memory. Let's take a look at the description of these flags: -6. AVL flag(bit 52) - Available and reserved bits. It is ignored in Linux. +- `Type` - describes the type of a memory segment. +- `S` - distinguishes system segments from code and data segments. +- `DPL` - provides information about the privilege level of a segment. It can be a value from 0 to 3, where 0 is the most privileged level. +- `P` - tells the CPU whether a segment presented in memory. +- `AVL` - available and reserved bits. It is ignored by the Linux kernel. +- `L` - indicates whether a code segment contains 64-bit code. +- `D / B` - provides different meaning depends on the type of a segment. + - For a code segment: Controls the default operand and address size. If the bit is clear, it is a 16-bit code segment. Otherwise it is a 32-bit code segment. + - For a stack segment or in other words a data segment pointed by the `ss` register: Controls the default stack pointer size. If the bit is clear, it is a 16-bit stack segment and stack operations use `sp` register. Otherwise it is a 32-bit stack segment and stack operations use `esp` register. + - For a expand-down data segment: Specifies the upper bound of the segment. If the bit is clear, the upper bound is `0xFFFF` or 64 KB. Otherwise, it is `0xFFFFFFFF` or 4 GB. -7. The L flag(bit 53) indicates whether a code segment contains native 64-bit code. If it is set, then the code segment executes in 64-bit mode. +If the `S` flag of a segment descriptor is set, the descriptor describes either a code or a data segment, otherwise it is a system segment. If the highest order bit of the `Type` flags is clear - this descriptor describes a data segment, otherwise a code segment. Rest of the three bits of a data segment descriptor interpreted as: -8. The D/B flag(bit 54) (Default/Big flag) represents the operand size i.e 16/32 bits. If set, operand size is 32 bits. Otherwise, it is 16 bits. - -Segment registers contain segment selectors as in real mode. However, in protected mode, a segment selector is handled differently. Each Segment Descriptor has an associated Segment Selector which is a 16-bit structure: - -``` - 15 3 2 1 0 ------------------------------ -| Index | TI | RPL | ------------------------------ -``` +- `Accessed` - indicates whether a segment has been accessed since the last time the kernel cleared this bit. +- `Write-Enable` - determines whether a segment is writable or read-only. +- `Expansion-Direction` - determines whether addresses decreasing from the base address or not. -Where, -* **Index** stores the index number of the descriptor in the GDT. -* **TI**(Table Indicator) indicates where to search for the descriptor. If it is 0 then the descriptor is searched for in the Global Descriptor Table(GDT). Otherwise, it will be searched for in the Local Descriptor Table(LDT). -* And **RPL** contains the Requester's Privilege Level. +For a code segment, these three bits interpreted as: -Every segment register has a visible and a hidden part. -* Visible - The Segment Selector is stored here. -* Hidden - The Segment Descriptor (which contains the base, limit, attributes & flags) is stored here. +- `Accessed` - indicates whether a segment has been accessed since the last time the kernel cleared this bit. +- `Read-Enable` - determines whether a segment is execute-only or execute-read. +- `Confirming` - determines how privilege level changes are handled when transferring execution to that segment. -The following steps are needed to get a physical address in protected mode: +In the tables below you can find full information about possible states of the flags for a code and a data segments. -* The segment selector must be loaded in one of the segment registers. -* The CPU tries to find a segment descriptor at the offset `GDT address + Index` from the selector and then loads the descriptor into the *hidden* part of the segment register. -* If paging is disabled, the linear address of the segment, or its physical address, is given by the formula: Base address (found in the descriptor obtained in the previous step) + Offset. +A data segment `Type` field: -Schematically it will look like this: +| E (Expand-Down) | W (Writable) | A (Accessed) | Description | +| --------------- | ------------ | ------------ | --------------------------------- | +| 0 | 0 | 0 | Read-Only | +| 0 | 0 | 1 | Read-Only, accessed | +| 0 | 1 | 0 | Read/Write | +| 0 | 1 | 1 | Read/Write, accessed | +| 1 | 0 | 0 | Read-Only, expand-down | +| 1 | 0 | 1 | Read-Only, expand-down, accessed | +| 1 | 1 | 0 | Read/Write, expand-down | +| 1 | 1 | 1 | Read/Write, expand-down, accessed | -![linear address](images/linear_address.png) +A code segment `Type` field: -The algorithm for the transition from real mode into protected mode is: +| C (Conforming) | R (Readable) | A (Accessed) | Description | +| -------------- | ------------ | ------------ | ---------------------------------- | +| 0 | 0 | 0 | Execute-Only | +| 0 | 0 | 1 | Execute-Only, accessed | +| 0 | 1 | 0 | Execute/Read | +| 0 | 1 | 1 | Execute/Read, accessed | +| 1 | 0 | 0 | Execute-Only, conforming | +| 1 | 1 | 0 | Execute/Read, conforming | +| 1 | 0 | 1 | Execute-Only, conforming, accessed | +| 1 | 1 | 1 | Execute/Read, conforming, accessed | -* Disable interrupts -* Describe and load the GDT with the `lgdt` instruction -* Set the PE (Protection Enable) bit in CR0 (Control Register 0) -* Jump to protected mode code +So far, we’ve looked at how a segment descriptor defines the properties of a memory segment — its base, limit, type, and different flags. But how does the CPU actually refer to one of these descriptors during execution? Just like in real mode - using segment registers. In protected mode they contain segment selectors. However, in protected mode, a segment selector is handled differently. Each segment descriptor has an associated segment selector which is a 16-bit structure: -We will see the complete transition to protected mode in the linux kernel in the next part, but before we can move to protected mode, we need to do some more preparations. +![segment-selector](./images/segment-selector.svg) -Let's look at [arch/x86/boot/main.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/main.c). We can see some routines there which perform keyboard initialization, heap initialization, etc... Let's take a look. +The meaning of the fields is: -Copying boot parameters into the "zeropage" --------------------------------------------------------------------------------- +- `Index` - the entry number of the descriptor in the descriptor table. +- `TI` - indicates where to search for the descriptor + - If the value of the bit is `0`, a descriptor will be searched in the Global Descriptor Table. + - If the value of this bit is `1`, a descriptor will be searched in the Local Descriptor Table. +- `RPL` - the privilege level requested by the selector. -We will start from the `main` routine in "main.c". The first function which is called in `main` is [`copy_boot_params(void)`](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/main.c). It copies the kernel setup header into the corresponding field of the `boot_params` structure which is defined in the [arch/x86/include/uapi/asm/bootparam.h](https://github.com/torvalds/linux/blob/v4.16/arch/x86/include/uapi/asm/bootparam.h) header file. +When a program running in protected mode references a memory, the CPU need to calculate a proper physical address. The following steps are needed to get a physical address in protected mode: -The `boot_params` structure contains the `struct setup_header hdr` field. This structure contains the same fields as defined in the [linux boot protocol](https://www.kernel.org/doc/Documentation/x86/boot.txt) and is filled by the boot loader and also at kernel compile/build time. `copy_boot_params` does two things: +1. A segment selector is loaded into one of the segment registers. +2. The CPU tries to find a associated segment descriptor in the Global Descriptor Table based on the `Index` value from the segment selector. If the descriptor was found, it is loaded into a special hidden part of this segment register. +3. The physical address will be the base address from the segment descriptor plus offset from the instruction pointer or memory location referenced within an executed instruction. -1. It copies `hdr` from [header.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/header.S#L280) to the `setup_header` field in `boot_params` structure. +In the next part, we will see the transition into protected mode. But before the kernel can be switched to protected mode, we need to do some more preparations. -2. It updates the pointer to the kernel command line if the kernel was loaded with the old command line protocol. +Let's continue from the point where we have stopped in the previous chapter. -Note that it copies `hdr` with the `memcpy` function, defined in the [copy.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/copy.S) source file. Let's have a look inside: +## Back to the Kernel: Entering main.c -```assembly -GLOBAL(memcpy) - pushw %si - pushw %di - movw %ax, %di - movw %dx, %si - pushw %cx - shrw $2, %cx - rep; movsl - popw %cx - andw $3, %cx - rep; movsb - popw %di - popw %si - retl -ENDPROC(memcpy) -``` +As we already have mentioned in the beginning of this chapter, one of the kernel's first main goals is to switch the processor into protected mode. But before this can happen, the kernel need to do some preparations. -Yeah, we just moved to C code and now assembly again :) First of all, we can see that `memcpy` and other routines which are defined here, start and end with the two macros: `GLOBAL` and `ENDPROC`. `GLOBAL` is described in [arch/x86/include/asm/linkage.h](https://github.com/torvalds/linux/blob/v4.16/arch/x86/include/asm/linkage.h) which defines the `globl` directive and its label. `ENDPROC` is described in [include/linux/linkage.h](https://github.com/torvalds/linux/blob/v4.16/include/linux/linkage.h) and marks the `name` symbol as a function name and ends with the size of the `name` symbol. +If we look at the very beginning of the `main` function from the [arch/x86/boot/main.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/main.c), the very first thing we will see is a call of the `init_default_io_ops` function. -The implementation of `memcpy` is simple. At first, it pushes values from the `si` and `di` registers to the stack to preserve their values because they will change during the `memcpy`. As we can see in the `REALMODE_CFLAGS` in `arch/x86/Makefile`, the kernel build system uses the `-mregparm=3` option of GCC, so functions get the first three parameters from `ax`, `dx` and `cx` registers. Calling `memcpy` looks like this: +This function defined in the [arch/x86/boot/io.h](https://github.com/torvalds/linux/blob/master/arch/x86/boot/io.h) and looks like: -```c -memcpy(&boot_params.hdr, &hdr, sizeof hdr); + +```C +static inline void init_default_io_ops(void) +{ + pio_ops.f_inb = __inb; + pio_ops.f_outb = __outb; + pio_ops.f_outw = __outw; +} ``` -So, -* `ax` will contain the address of `boot_params.hdr` -* `dx` will contain the address of `hdr` -* `cx` will contain the size of `hdr` in bytes. - -`memcpy` puts the address of `boot_params.hdr` into `di` and saves `cx` on the stack. After this it shifts the value right 2 times (or divides it by 4) and copies four bytes from the address at `si` to the address at `di`. After this, we restore the size of `hdr` again, align it by 4 bytes and copy the rest of the bytes from the address at `si` to the address at `di` byte by byte (if there is more). Now the values of `si` and `di` are restored from the stack and the copying operation is finished. - -Console initialization --------------------------------------------------------------------------------- - -After `hdr` is copied into `boot_params.hdr`, the next step is to initialize the console by calling the `console_init` function, defined in [arch/x86/boot/early_serial_console.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/early_serial_console.c). - -It tries to find the `earlyprintk` option in the command line and if the search was successful, it parses the port address and baud rate of the serial port and initializes the serial port. The value of the `earlyprintk` command line option can be one of these: +This function initializes function pointers for: -* serial,0x3f8,115200 -* serial,ttyS0,115200 -* ttyS0,115200 +- reading a byte from an I/O port +- writing a byte to an I/O port +- writing a word (16-bit) to an I/O port -After serial port initialization we can see the first output: +These callbacks will be used to write data to the serial console which will be initialized at the one of the next steps. All the operations will be executed with the help of the `inb`, `outb`, and `outw` macros which defined in the same file: + ```C -if (cmdline_find_option_bool("debug")) - puts("early console in setup code\n"); +#define inb pio_ops.f_inb +#define outb pio_ops.f_outb +#define outw pio_ops.f_outw ``` -The definition of `puts` is in [tty.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/tty.c). As we can see it prints character by character in a loop by calling the `putchar` function. Let's look into the `putchar` implementation: +The `__inb`, `__outb`, and `__outw` themselves are inline functions from the [arch/x86/include/asm/shared/io.h](https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/shared/io.h): + ```C -void __attribute__((section(".inittext"))) putchar(int ch) -{ - if (ch == '\n') - putchar('\r'); - - bios_putchar(ch); - - if (early_serial_base != 0) - serial_putchar(ch); +#define BUILDIO(bwl, bw, type) \ +static __always_inline void __out##bwl(type value, u16 port) \ +{ \ + asm volatile("out" #bwl " %" #bw "0, %w1" \ + : : "a"(value), "Nd"(port)); \ +} \ + \ +static __always_inline type __in##bwl(u16 port) \ +{ \ + type value; \ + asm volatile("in" #bwl " %w1, %" #bw "0" \ + : "=a"(value) : "Nd"(port)); \ + return value; \ } + +BUILDIO(b, b, u8) +BUILDIO(w, w, u16) +BUILDIO(l, , u32) ``` -`__attribute__((section(".inittext")))` means that this code will be in the `.inittext` section. We can find it in the linker file [setup.ld](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/setup.ld). +All of these functions use `in` and `out` assembly instructions which send the given value to the given port or read the value from the given port. If the syntax is not familiar to you, you can read the chapter about [inline assembly](https://github.com/0xAX/linux-insides/blob/master/Theory/linux-theory-3.md). -First of all, `putchar` checks for the `\n` symbol and if it is found, prints `\r` before. After that it prints the character on the VGA screen by calling the BIOS with the `0x10` interrupt call: +After initialization of callbacks for writing to a serial port, the next step is copying of the kernel setup header filled by a bootloader into the corresponding field of the C `boot_params` structure. This will make the fields from the kernel setup header more easily accessible. All the job by copying handled by the `copy_boot_params` function with the help of `memcpy`: + ```C -static void __attribute__((section(".inittext"))) bios_putchar(int ch) -{ - struct biosregs ireg; - - initregs(&ireg); - ireg.bx = 0x0007; - ireg.cx = 0x0001; - ireg.ah = 0x0e; - ireg.al = ch; - intcall(0x10, &ireg, NULL); -} + memcpy(&boot_params.hdr, &hdr, sizeof(hdr)); ``` -Here `initregs` takes the `biosregs` structure and first fills `biosregs` with zeros using the `memset` function and then fills it with register values. +Do not mix this `memcpy` with the function from the C standard library - [memcpy](https://man7.org/linux/man-pages/man3/memcpy.3.html). During the time when the kernel is in the early initialization phase, there is no way to load any library. For this reason, an operating system kernel provides own implementation of such functions. The kernel's `memcpy` defined in the [copy.S](https://github.com/torvalds/linux/blob/master/arch/x86/boot/copy.S). If you already started to miss an assembly code, this is the high time to bring some back: -```C - memset(reg, 0, sizeof *reg); - reg->eflags |= X86_EFLAGS_CF; - reg->ds = ds(); - reg->es = ds(); - reg->fs = fs(); - reg->gs = gs(); + +```assembly +SYM_FUNC_START_NOALIGN(memcpy) + pushw %si + pushw %di + movw %ax, %di + movw %dx, %si + pushw %cx + shrw $2, %cx + rep movsl + popw %cx + andw $3, %cx + rep movsb + popw %di + popw %si + retl +SYM_FUNC_END(memcpy) ``` -Let's look at the implementation of [memset](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/copy.S#L36): +First of all, we can see that `memcpy` and other routines which are defined there, start and end with the two macros - `SYM_FUNC_START_NOALIGN` and `SYM_FUNC_END`. The `SYM_FUNC_START_NOALIGN` just specifies the given symbol name as [.globl](https://sourceware.org/binutils/docs/as.html#Global) to make it visible for other functions. The `SYM_FUNC_END` just expands to an empty string in our case. -```assembly -GLOBAL(memset) - pushw %di - movw %ax, %di - movzbl %dl, %eax - imull $0x01010101,%eax - pushw %cx - shrw $2, %cx - rep; stosl - popw %cx - andw $3, %cx - rep; stosb - popw %di - retl -ENDPROC(memset) -``` +Despite the implementation of this function is written in assembly language, the implementation of `memcpy` is relatively simple. At first, it pushes values from the `si` and `di` registers to the stack to preserve their values because they will change during the `memcpy` execution. At the next step we may see handling of the function's parameters. The parameters of this function are passed through the `ax`, `dx`, and `cx` registers. This is because the kernel setup code is built with `-mregparm=3` option. So: -As you can read above, it uses the same calling conventions as the `memcpy` function, which means that the function gets its parameters from the `ax`, `dx` and `cx` registers. +- `ax` will contain the address of `boot_params.hdr` +- `dx` will contain the address of `hdr` +- `cx` will contain the size of `hdr` in bytes -The implementation of `memset` is similar to that of memcpy. It saves the value of the `di` register on the stack and puts the value of`ax`, which stores the address of the `biosregs` structure, into `di` . Next is the `movzbl` instruction, which copies the value of `dl` to the lowermost byte of the `eax` register. The remaining 3 high bytes of `eax` will be filled with zeros. +The `rep movsl` instruction copies bytes from the memory pointed by the `si` register to the memory location pointed by the `di` register. At each iteration 4 bytes copied. For this reason we divided the size of the setup header by 4 using `shrw` instruction. After this step we just copy rest of bytes that is not divided by 4. -The next instruction multiplies `eax` with `0x01010101`. It needs to because `memset` will copy 4 bytes at the same time. For example, if we need to fill a structure whose size is 4 bytes with the value `0x7` with memset, `eax` will contain the `0x00000007`. So if we multiply `eax` with `0x01010101`, we will get `0x07070707` and now we can copy these 4 bytes into the structure. `memset` uses the `rep; stosl` instruction to copy `eax` into `es:di`. +From this point, the setup header is copied into a proper place and we can move forward. -The rest of the `memset` function does almost the same thing as `memcpy`. +### Console initialization -After the `biosregs` structure is filled with `memset`, `bios_putchar` calls the [0x10](http://www.ctyme.com/intr/rb-0106.htm) interrupt which prints a character. Afterwards it checks if the serial port was initialized or not and writes a character there with [serial_putchar](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/tty.c) and `inb/outb` instructions if it was set. +As soon as the kernel setup header is copied into the `boot_params.hdr`, the next step is to initialize the serial console by calling the `console_init` function. Very soon we will be able to print something from within the kernel code! -Heap initialization --------------------------------------------------------------------------------- +The `console_init` defined in [arch/x86/boot/early_serial_console.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/early_serial_console.c). At the very first step it tries to find the `earlyprintk` option in the kernel's command line. If the search was successful, it parses the port address and [baud rate](https://en.wikipedia.org/wiki/Baud) and executes the initialization of the serial port. -After the stack and bss section have been prepared in [header.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/header.S) (see previous [part](linux-bootstrap-1.md)), the kernel needs to initialize the [heap](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/main.c) with the [`init_heap`](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/main.c) function. +> [!NOTE] +> If you want to know what else options you can pass in the kernel command line, you can find more information in the [The kernel's command-line parameters](https://github.com/torvalds/linux/blob/master/Documentation/admin-guide/kernel-parameters.rst) document. -First of all `init_heap` checks the [`CAN_USE_HEAP`](https://github.com/torvalds/linux/blob/v4.16/arch/x86/include/uapi/asm/bootparam.h#L24) flag from the [`loadflags`](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/header.S#L320) structure in the kernel setup header and calculates the end of the stack if this flag was set: +Let's take a look at these two steps in details. -```C - char *stack_end; +The possible values of the `earlyprintk` command line option are: - if (boot_params.hdr.loadflags & CAN_USE_HEAP) { - asm("leal %P1(%%esp),%0" - : "=r" (stack_end) : "i" (-STACK_SIZE)); -``` +- `serial,0x3f8,115200` +- `serial,ttyS0,115200` +- `ttyS0,115200` -or in other words `stack_end = esp - STACK_SIZE`. +The parameters defines the name of a serial port, the port number and the baud rate. The pointer to the kernel command line is stored in the kernel setup header and can be accessed through `boot_params.hdr.cmd_line_ptr`. The `parse_earlyprintk` function tries to find the `earlyprintk` option in the kernel command line, parse it if it was found and initialize the serial console parameters with one of the values above. If the `earlyprintk` option is given and contains valid values, the initialization of the serial console takes place in the `early_serial_init` function. There is nothing specific to Linux kernel in the initialization of a serial console, so we will skip this part. If you want to dive deeper by yourself, more information you can find [here](https://wiki.osdev.org/Serial_Ports#Port_Addresses) and learn [arch/x86/boot/early_serial_console.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/early_serial_console.c) step by step. -Then there is the `heap_end` calculation: +After the serial port initialization we can see the first output: + ```C - heap_end = (char *)((size_t)boot_params.hdr.heap_end_ptr + 0x200); + if (cmdline_find_option_bool("debug")) + puts("early console in setup code\n"); ``` -which means `heap_end_ptr` or `_end` + `512` (`0x200h`). The last check is whether `heap_end` is greater than `stack_end`. If it is then `stack_end` is assigned to `heap_end` to make them equal. +The `puts` function uses the `inb` function that we have seen above during initialization of I/O callbacks. -Now the heap is initialized and we can use it using the `GET_HEAP` method. We will see what it is used for, how to use it and how it is implemented in the next posts. +From this point we can print messages from the kernel setup code 🎉. Time to move to the next step. -CPU validation --------------------------------------------------------------------------------- +### Heap initialization -The next step as we can see is cpu validation through the `validate_cpu` function from [arch/x86/boot/cpu.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/cpu.c) source code file. - -It calls the [`check_cpu`](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/cpucheck.c) function and passes cpu level and required cpu level to it and checks that the kernel launches on the right cpu level. +We have seen the initialization of the `stack` and `bss` memory areas in the previous chapter. The next step is to initialize the [heap](https://en.wikipedia.org/wiki/Memory_management#HEAP) memory area. The heap initialization takes place in the `init_heap` function: + ```C -check_cpu(&cpu_level, &req_level, &err_flags); -if (cpu_level < req_level) { - ... - return -1; +static void init_heap(void) +{ + char *stack_end; + + if (boot_params.hdr.loadflags & CAN_USE_HEAP) { + stack_end = (char *) (current_stack_pointer - STACK_SIZE); + heap_end = (char *) ((size_t)boot_params.hdr.heap_end_ptr + 0x200); + if (heap_end > stack_end) + heap_end = stack_end; + } else { + /* Boot protocol 2.00 only, no heap available */ + puts("WARNING: Ancient bootloader, some functionality may be limited!\n"); + } } ``` -The `check_cpu` function checks the CPU's flags, the presence of [long mode](http://en.wikipedia.org/wiki/Long_mode) in the case of x86_64(64-bit) CPU, checks the processor's vendor and makes preparations for certain vendors like turning off SSE+SSE2 for AMD if they are missing, etc. +First of all, `init_heap` checks the `CAN_USE_HEAP` flag from the kernel setup header. If it is not set, we'll see the warning message. If heap is enabled, the last address of it is set to the `boot_params.hdr.heap_end_ptr` filled by bootloader plus 512 bytes or to the end of the stack if the value specified by bootloader is above it. The beginning of the heap is right after the end of the `.bss` area. The stack size is 1024 bytes. Thereby, the memory map will look like: + +![early-heap](./images/early-heap.svg) + +Now the heap is initialized, although we will see the usage of it in the next chapters. + +### CPU validation + +The next step is the validation of CPU on which the kernel is running. The kernel has to do it to make sure that the all required functionalities will work correctly on the given CPU. -at the next step, we may see a call to the `set_bios_mode` function after setup code found that a CPU is suitable. As we may see, this function is implemented only for the `x86_64` mode: +The `validate_cpu` function from [arch/x86/boot/cpu.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/cpu.c) validates the CPU. This function calls the [`check_cpu`](https://github.com/torvalds/linux/blob/master/arch/x86/boot/cpucheck.c) which check the CPU model and its flags using the [cpuid](https://en.wikipedia.org/wiki/CPUID) instruction. The CPU's flags are checked like the presence of [long mode](http://en.wikipedia.org/wiki/Long_mode), checks the processor's vendor and makes preparations for certain vendors like turning on extensions like [SSE+SSE2](https://en.wikipedia.org/wiki/Single_instruction,_multiple_data): + ```C -static void set_bios_mode(void) +int validate_cpu(void) { -#ifdef CONFIG_X86_64 - struct biosregs ireg; - - initregs(&ireg); - ireg.ax = 0xec00; - ireg.bx = 2; - intcall(0x15, &ireg, NULL); -#endif -} + u32 *err_flags; + int cpu_level, req_level; + + check_cpu(&cpu_level, &req_level, &err_flags); + + if (cpu_level < req_level) { + printf("This kernel requires an %s CPU, ", + cpu_name(req_level)); + printf("but only detected an %s CPU.\n", + cpu_name(cpu_level)); + return -1; + } ``` -The `set_bios_mode` function executes the `0x15` BIOS interrupt to tell the BIOS that [long mode](https://en.wikipedia.org/wiki/Long_mode) (if `bx == 2`) will be used. +If the level of CPU is less than the required level specified by the `CONFIG_X86_MINIMUM_CPU_FAMILY` kernel configuration option, the function returns the error and the kernel setup process is aborted. -Memory detection --------------------------------------------------------------------------------- +### Memory detection -The next step is memory detection through the `detect_memory` function. `detect_memory` basically provides a map of available RAM to the CPU. It uses different programming interfaces for memory detection like `0xe820`, `0xe801` and `0x88`. We will see only the implementation of the **0xE820** interface here. +After the kernel became sure that the CPU which it is running on is suitable, the next stage is to detect available memory in the system. This task is handled by the `detect_memory` function, which queries the system firmware to obtain a map of physical memory regions. To do this, the kernel uses the special BIOS service - `0xE820`, but kernel can fallback to legacy BIOS services like `0xE801` or `0x88`. In this chapter, we will see only the implementation of the `0xE820` interface. -Let's look at the implementation of the `detect_memory_e820` function from the [arch/x86/boot/memory.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/memory.c) source file. First of all, the `detect_memory_e820` function initializes the `biosregs` structure as we saw above and fills registers with special values for the `0xe820` call: +The `detect_memory` function defined in the [arch/x86/boot/memory.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/memory.c) and as just mentioned, tries to get the information about available memory: -```assembly - initregs(&ireg); - ireg.ax = 0xe820; - ireg.cx = sizeof buf; - ireg.edx = SMAP; - ireg.di = (size_t)&buf; -``` + +```C +void detect_memory(void) +{ + detect_memory_e820(); + + detect_memory_e801(); -* `ax` contains the number of the function (0xe820 in our case) -* `cx` contains the size of the buffer which will contain data about the memory -* `edx` must contain the `SMAP` magic number -* `es:di` must contain the address of the buffer which will contain memory data -* `ebx` has to be zero. + detect_memory_88(); +} +``` -Next is a loop where data about the memory will be collected. It starts with a call to the `0x15` BIOS interrupt, which writes one line from the address allocation table. For getting the next line we need to call this interrupt again (which we do in the loop). Before the next call `ebx` must contain the value returned previously: +Let's look at the crucial part of the implementation of the `detect_memory_e820` function. First of all, the `detect_memory_e820` function initializes the `biosregs` structure with the special values related to the `0xE820` BIOS interface: + ```C - intcall(0x15, &ireg, &oreg); - ireg.ebx = oreg.ebx; + initregs(&ireg); + ireg.ax = 0xe820; + ireg.cx = sizeof(buf); + ireg.edx = SMAP; + ireg.di = (size_t)&buf; ``` -Ultimately, this function collects data from the address allocation table and writes this data into the `e820_entry` array: +- `ax` register contains the number of the BIOS service +- `cx` register contains the size of the buffer which will contain the data about available memory +- `di` register contain the address of the buffer which will contain memory data +- `edx` register contains the `SMAP` magic number -* start of memory segment -* size of memory segment -* type of memory segment (whether the particular segment is usable or reserved) +After registers filled with the needed values, the kernel can ask the `0xE820` BIOS interface about available memory. The kernel does it by the invoking `0x15` [BIOS interrupt](https://en.wikipedia.org/wiki/BIOS_interrupt_call) which returns information about one memory region. The kernel repeats this operation in the loop until information about all the memory regions is not collected. -You can see the result of this in the `dmesg` output, something like: +After the information is called, the kernel print message about the available memory regions. You can find it in the [dmesg](https://en.wikipedia.org/wiki/Dmesg) output: ``` [ 0.000000] e820: BIOS-provided physical RAM map: @@ -424,87 +369,79 @@ You can see the result of this in the `dmesg` output, something like: [ 0.000000] BIOS-e820: [mem 0x00000000fffc0000-0x00000000ffffffff] reserved ``` -Keyboard initialization --------------------------------------------------------------------------------- +### Keyboard initialization -The next step is the initialization of the keyboard with a call to the [`keyboard_init`](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/main.c) function. At first `keyboard_init` initializes registers using the `initregs` function. It then calls the [0x16](http://www.ctyme.com/intr/rb-1756.htm) interrupt to query the status of the keyboard. +Once memory detection is complete, the kernel proceeds with initializing the keyboard using the `keyboard_init`: -```c - initregs(&ireg); - ireg.ah = 0x02; /* Get keyboard status */ - intcall(0x16, &ireg, &oreg); - boot_params.kbd_status = oreg.al; -``` + +```C +static void keyboard_init(void) +{ + struct biosregs ireg, oreg; + + initregs(&ireg); -After this it calls [0x16](http://www.ctyme.com/intr/rb-1757.htm) again to set the repeat rate and delay. + ireg.ah = 0x02; /* Get keyboard status */ + intcall(0x16, &ireg, &oreg); + boot_params.kbd_status = oreg.al; -```c - ireg.ax = 0x0305; /* Set keyboard repeat rate */ - intcall(0x16, &ireg, NULL); + ireg.ax = 0x0305; /* Set keyboard repeat rate */ + intcall(0x16, &ireg, NULL); +} ``` -Querying --------------------------------------------------------------------------------- +This function performs two tasks using [BIOS interrupt](https://en.wikipedia.org/wiki/BIOS_interrupt_call) `0x16`: -The next couple of steps are queries for different parameters. We will not dive into details about these queries but we will get back to them in later parts. Let's take a short look at these functions: +1. Gets the state of a keyboard which contains information about state of certain modifier keys, like for example Caps Lock active or not. +2. Sets the keyboard repeat rate which determines how long a key must hold down before it begins repeating -The first step is getting [Intel SpeedStep](http://en.wikipedia.org/wiki/SpeedStep) information by calling the `query_ist` function. It checks the CPU level and if it is correct, calls `0x15` to get the info and saves the result to `boot_params`. +### Gathering system information -Next, the [query_apm_bios](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/apm.c#L21) function gets [Advanced Power Management](http://en.wikipedia.org/wiki/Advanced_Power_Management) information from the BIOS. `query_apm_bios` calls the `0x15` BIOS interruption too, but with `ah` = `0x53` to check `APM` installation. After `0x15` finishes executing, the `query_apm_bios` functions check the `PM` signature (it must be `0x504d`), the carry flag (it must be 0 if `APM` supported) and the value of the `cx` register (if it's 0x02, the protected mode interface is supported). +After we went though the most essential hardware interfaces like CPU, I/O, memory map, keyboard, the next a couple of steps are to query the BIOS for additional information about the system. The information which kernel is going to gather is not strictly required for entering protected mode, but it provides useful details that later parts of the kernel may rely on. -Next, it calls `0x15` again, but with `ax = 0x5304` to disconnect the `APM` interface and connect the 32-bit protected mode interface. In the end, it fills `boot_params.apm_bios_info` with values obtained from the BIOS. +The following information is going to be collected: -Note that `query_apm_bios` will be executed only if the `CONFIG_APM` or `CONFIG_APM_MODULE` compile time flag was set in the configuration file: +- Information about [Intel SpeedStep](http://en.wikipedia.org/wiki/SpeedStep) +- Information about [Advanced Power Management](http://en.wikipedia.org/wiki/Advanced_Power_Management) +- Information about [Enhanced Disk Drive](https://en.wikipedia.org/wiki/INT_13H) +At this moment we will not dive into details about each of this query, but will get back to them in the next parts when we will use this information. For now, just let's take a short look at these functions: + + ```C + /* Query Intel SpeedStep (IST) information */ + query_ist(); + + /* Query APM information */ #if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE) - query_apm_bios(); + query_apm_bios(); #endif -``` - -The last is the [`query_edd`](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/edd.c#L122) function, which queries `Enhanced Disk Drive` information from the BIOS. Let's look at how `query_edd` is implemented. - -First of all, it reads the [edd](https://github.com/torvalds/linux/blob/v4.16/Documentation/admin-guide/kernel-parameters.rst) option from the kernel's command line and if it was set to `off` then `query_edd` just returns. -If EDD is enabled, `query_edd` goes over BIOS-supported hard disks and queries EDD information in the following loop: - -```C -for (devno = 0x80; devno < 0x80+EDD_MBR_SIG_MAX; devno++) { - if (!get_edd_info(devno, &ei) && boot_params.eddbuf_entries < EDDMAXNR) { - memcpy(edp, &ei, sizeof ei); - edp++; - boot_params.eddbuf_entries++; - } - ... - ... - ... - } + /* Query EDD information */ +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) + query_edd(); +#endif ``` -where `0x80` is the first hard drive and the value of the `EDD_MBR_SIG_MAX` macro is 16. It collects data into an array of [edd_info](https://github.com/torvalds/linux/blob/v4.16/include/uapi/linux/edd.h) structures. `get_edd_info` checks that EDD is present by invoking the `0x13` interrupt with `ah` as `0x41` and if EDD is present, `get_edd_info` again calls the `0x13` interrupt, but with `ah` as `0x48` and `si` containing the address of the buffer where EDD information will be stored. +The first one is getting information about the [Intel SpeedStep](http://en.wikipedia.org/wiki/SpeedStep). This information is obtained by the calling the `0x15` BIOS interrupt and store the result in the `boot_params` structure. The returned information describes the support of the Intel SpeedStep and settings around it. If it is supported, this information will be passed later by the kernel to the power management subsystems. + +The next one is getting information about the [Advanced Power Management](http://en.wikipedia.org/wiki/Advanced_Power_Management). The logic of this function is pretty similar to the one described above. It uses the same `0x15` BIOS interrupt to obtain information and store it in the `boot_params` structure. The returned information describes the support of the `APM` which was power management sub-system before [ACPI](https://en.wikipedia.org/wiki/ACPI) started to be a standard. -Conclusion --------------------------------------------------------------------------------- +The last one function gets information about the `Enhanced Disk Drive` from the BIOS. The same `0x13` BIOS interrupt is used to obtain this information. The returned information describes the disks and their characteristics like geometry and mapping information. -This is the end of the second part about the insides of the Linux kernel. In the next part, we will see video mode setting and the rest of the preparations before the transition to protected mode and directly transitioning into it. +## Conclusion -If you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX). +This is the end of the second part about Linux kernel insides. If you have questions or suggestions, feel free ping me on X - [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com), or just create an [issue](https://github.com/0xAX/linux-insides/issues/new). In the next part, we will continue to deal with the preparations before transitioning into protected mode and the transitioning itself. -**Please note that English is not my first language, And I am really sorry for any inconvenience. If you find any mistakes please send me a PR to [linux-insides](https://github.com/0xAX/linux-internals).** +## Links -Links --------------------------------------------------------------------------------- +Here is the list of the links that you may find useful during reading of this chapter: -* [Protected mode](http://en.wikipedia.org/wiki/Protected_mode) -* [Protected mode](http://wiki.osdev.org/Protected_Mode) -* [Long mode](http://en.wikipedia.org/wiki/Long_mode) -* [Nice explanation of CPU Modes with code](http://www.codeproject.com/Articles/45788/The-Real-Protected-Long-mode-assembly-tutorial-for) -* [How to Use Expand Down Segments on Intel 386 and Later CPUs](http://www.sudleyplace.com/dpmione/expanddown.html) -* [earlyprintk documentation](https://github.com/torvalds/linux/blob/v4.16/Documentation/x86/earlyprintk.txt) -* [Kernel Parameters](https://github.com/torvalds/linux/blob/v4.16/Documentation/admin-guide/kernel-parameters.rst) -* [Serial console](https://github.com/torvalds/linux/blob/v4.16/Documentation/admin-guide/serial-console.rst) -* [Intel SpeedStep](http://en.wikipedia.org/wiki/SpeedStep) -* [APM](https://en.wikipedia.org/wiki/Advanced_Power_Management) -* [EDD specification](http://www.t13.org/documents/UploadedDocuments/docs2004/d1572r3-EDD3.pdf) -* [TLDP documentation for Linux Boot Process](http://www.tldp.org/HOWTO/Linux-i386-Boot-Code-HOWTO/setup.html) (old) -* [Previous Part](linux-bootstrap-1.md) +- [Protected mode](http://en.wikipedia.org/wiki/Protected_mode) +- [Long mode](http://en.wikipedia.org/wiki/Long_mode) +- [The kernel's command-line parameters](https://github.com/torvalds/linux/blob/master/Documentation/admin-guide/kernel-parameters.rst) +- [Linux serial console](https://github.com/torvalds/linux/blob/master/Documentation/admin-guide/serial-console.rst) +- [BIOS interrupt](https://en.wikipedia.org/wiki/BIOS_interrupt_call) +- [Intel SpeedStep](http://en.wikipedia.org/wiki/SpeedStep) +- [APM](https://en.wikipedia.org/wiki/Advanced_Power_Management) +- [EDD specification](http://www.t13.org/documents/UploadedDocuments/docs2004/d1572r3-EDD3.pdf) diff --git a/Booting/linux-bootstrap-3.md b/Booting/linux-bootstrap-3.md index 46aa3bca..28b07262 100644 --- a/Booting/linux-bootstrap-3.md +++ b/Booting/linux-bootstrap-3.md @@ -1,26 +1,69 @@ -Kernel booting process. Part 3. -================================================================================ +# Kernel booting process. Part 3. -Video mode initialization and transition to protected mode --------------------------------------------------------------------------------- +In the previous [part](./linux-bootstrap-2.md), we have seen first pieces of C code that run in the Linux kernel. One of the main goal of this stage is to switch into the [protected mode](https://en.wikipedia.org/wiki/Protected_mode), but before this, we have seen some early setup code which executes early initialization procedures, such as: -This is the third part of the `Kernel booting process` series. In the previous [part](linux-bootstrap-2.md#kernel-booting-process-part-2), we stopped right before the call to the `set_video` routine from [main.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/main.c). +- Setup of console to be able to print messages from the kernel's setup code +- Validation of CPU +- Detection of available memory +- Initialization of keyboard -In this part, we will look at: +In this part we will continue to explore the next steps before we will see the transition into the protected mode. -* Video mode initialization in the kernel setup code, -* the preparations made before switching into protected mode, -* the transition to protected mode +## Video mode setup -**NOTE** If you don't know anything about protected mode, you can find some information about it in the previous [part](linux-bootstrap-2.md#protected-mode). Also, there are a couple of [links](linux-bootstrap-2.md#links) which can help you. +Previously, we stopped right at the point where the kernel setup code was about to initialize the video mode. -As I wrote above, we will start from the `set_video` function which is defined in the [arch/x86/boot/video.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/video.c) source code file. We can see that it starts by first getting the video mode from the `boot_params.hdr` structure: +The setup code is located in the [arch/x86/boot/video.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/video.c) and implemented by the `set_video` function. Now let's take a look at the implementation of the `set_video` function: + + +```C +void set_video(void) +{ + u16 mode = boot_params.hdr.vid_mode; + + RESET_HEAP(); + + store_mode_params(); + save_screen(); + probe_cards(0); + + for (;;) { + if (mode == ASK_VGA) + mode = mode_menu(); + + if (!set_mode(mode)) + break; + + printf("Undefined video mode number: %x\n", mode); + mode = ASK_VGA; + } + boot_params.hdr.vid_mode = mode; + vesa_store_edid(); + store_mode_params(); + + if (do_restore) + restore_screen(); +} +``` + +Let's try to understand what this function does in the next sections. + +### Video modes + +The implementation of the `set_video` function starts by getting the video mode from the `boot_params.hdr` structure: ```C u16 mode = boot_params.hdr.vid_mode; ``` -which we filled in the `copy_boot_params` function (you can read about it in the previous post). `vid_mode` is an obligatory field which is filled by the bootloader. You can find information about it in the kernel `boot protocol`: +> [!NOTE] +> Instead of old good standard C data types like `int`, `short`, `unsigned short`, Linux kernel provides own data types for numeric values. Here is the table that will help you to remember them: +> +> | Type | char | short | int | long | u8 | u16 | u32 | u64 | +> |------|------|-------|-----|------|----|-----|-----|-----| +> | Size | 1 | 2 | 4 | 8 | 1 | 2 | 4 | 8 | + +The initial value of the video mode can be filled by the bootloader. This header field defined in the Linux kernel boot protocol: ``` Offset Proto Name Meaning @@ -28,7 +71,7 @@ Offset Proto Name Meaning 01FA/2 ALL vid_mode Video mode control ``` -As we can read from the linux kernel boot protocol: +Information about potential values for this field can be also found in the Linux kernel boot protocol document: ``` vga= @@ -40,60 +83,86 @@ vga= line is parsed. ``` -So we can add the `vga` option to the grub (or another bootloader's) configuration file and it will pass this option to the kernel command line. This option can have different values as mentioned in the description. For example, it can be an integer number `0xFFFD` or `ask`. If you pass `ask` to `vga`, you will see a menu like this: - -![video mode setup menu](images/video_mode_setup_menu.png) - -which will ask to select a video mode. We will look at its implementation, but before diving into the implementation we have to look at some other things. - -Kernel data types --------------------------------------------------------------------------------- - -Earlier we saw definitions of different data types like `u16` etc. in the kernel setup code. Let's look at a couple of data types provided by the kernel: - - -| Type | char | short | int | long | u8 | u16 | u32 | u64 | -|------|------|-------|-----|------|----|-----|-----|-----| -| Size | 1 | 2 | 4 | 8 | 1 | 2 | 4 | 8 | - -If you read the source code of the kernel, you'll see these very often and so it will be good to remember them. - -Heap API --------------------------------------------------------------------------------- - -After we get `vid_mode` from `boot_params.hdr` in the `set_video` function, we can see the call to the `RESET_HEAP` function. `RESET_HEAP` is a macro which is defined in [arch/x86/boot/boot.h](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/boot.h) header file. - -This macro is defined as: - +This tells us that we can add the `vga` option to the GRUB (or another bootloader's) configuration file and it will pass this option to the kernel command line. This option can have different values as mentioned in the description above. For example, it can be an integer number `0xFFFD` or `ask`. If you pass `ask` to `vga`, you will see a menu with the possible video modes. We can test it using [QEMU](https://www.qemu.org/) virtual machine: + +```bash +sudo qemu-system-x86_64 -kernel ./linux/arch/x86/boot/bzImage \ + -nographic \ + -append "console=ttyS0 nokaslr vga=ask" \ + -initrd /boot/initramfs-6.17.0-rc3-g1b237f190eb3.img +``` + +If you did everything correctly, after the kernel is loaded it will ask you to press the `ENTER`. By pressing on it you should see something like this: + +``` +Booting from ROM... +Probing EDD (edd=off to disable)... ok +Press to see video modes available, to continue, or wait 30 sec +Mode: Resolution: Type: Mode: Resolution: Type: Mode: Resolution: Type: +0 F00 80x25 VGA 1 F01 80x50 VGA 2 F02 80x43 VGA +3 F03 80x28 VGA 4 F05 80x30 VGA 5 F06 80x34 VGA +6 F07 80x60 VGA 7 340 320x200x32 VESA 8 341 640x400x32 VESA +9 342 640x480x32 VESA a 343 800x600x32 VESA b 344 1024x768x32 VESA +c 345 1280x1024x32 VESA d 347 1600x1200x32 VESA e 34C 1152x864x32 VESA +f 377 1280x768x32 VESA g 37A 1280x800x32 VESA h 37D 1280x960x32 VESA +i 380 1440x900x32 VESA j 383 1400x1050x32 VESA k 386 1680x1050x32 VESA +l 389 1920x1200x32 VESA m 38C 2560x1600x32 VESA n 38F 1280x720x32 VESA +o 392 1920x1080x32 VESA p 300 640x400x8 VESA q 301 640x480x8 VESA +r 303 800x600x8 VESA s 305 1024x768x8 VESA t 307 1280x1024x8 VESA +u 30D 320x200x15 VESA v 30E 320x200x16 VESA w 30F 320x200x24 VESA +x 310 640x480x15 VESA y 311 640x480x16 VESA z 312 640x480x24 VESA + 313 800x600x15 VESA 314 800x600x16 VESA 315 800x600x24 VESA + 316 1024x768x15 VESA 317 1024x768x16 VESA 318 1024x768x24 VESA + 319 1280x1024x15 VESA 31A 1280x1024x16 VESA 31B 1280x1024x24 VESA + 31C 1600x1200x8 VESA 31D 1600x1200x15 VESA 31E 1600x1200x16 VESA + 31F 1600x1200x24 VESA 346 320x200x8 VESA 348 1152x864x8 VESA + 349 1152x864x15 VESA 34A 1152x864x16 VESA 34B 1152x864x24 VESA + 375 1280x768x16 VESA 376 1280x768x24 VESA 378 1280x800x16 VESA + 379 1280x800x24 VESA 37B 1280x960x16 VESA 37C 1280x960x24 VESA + 37E 1440x900x16 VESA 37F 1440x900x24 VESA 381 1400x1050x16 VESA + 382 1400x1050x24 VESA 384 1680x1050x16 VESA 385 1680x1050x24 VESA + 387 1920x1200x16 VESA 388 1920x1200x24 VESA 38A 2560x1600x16 VESA + 38B 2560x1600x24 VESA 38D 1280x720x16 VESA 38E 1280x720x24 VESA + 390 1920x1080x16 VESA 391 1920x1080x24 VESA 393 1600x900x16 VESA + 394 1600x900x24 VESA 395 1600x900x32 VESA 396 2560x1440x16 VESA + 397 2560x1440x24 VESA 398 2560x1440x32 VESA 399 3840x2160x16 VESA + 200 40x25 VESA 201 40x25 VESA 202 80x25 VESA + 203 80x25 VESA 207 80x25 VESA 213 320x200x8 VESA +Enter a video mode or "scan" to scan for additional modes: +``` + +### Early heap API + +Before proceeding further to investigate what the `set_video` function does, it will be useful to take a look at the API for the management of the kernel's early heap. + +After getting the video mode set by the bootloader, we can see reseting the heap value by the `RESET_HEAP` macro. The definition of this macro is in the [arch/x86/boot/boot.h](https://github.com/torvalds/linux/blob/master/arch/x86/boot/boot.h): + + ```C #define RESET_HEAP() ((void *)( HEAP = _end )) ``` -If you have read the second part, you will remember that we initialized the heap with the [`init_heap`](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/main.c) function. We have a couple of utility macros and functions for managing the heap which are defined in `arch/x86/boot/boot.h` header file. +If you have read the [part](./linux-bootstrap-2.md#kernel-booting-process-part-2), you should remember that we have seen initialization of the heap memory area.The kernel setup code provides a couple of utility macros and functions for managing the early heap. Let's take a look at some of them, especially at ones which we will meet in this chapter. -They are: +The `RESET_HEAP` macro resets the heap by setting the `HEAP` variable to the `_end` which represents the end of the early setup kernel's `text` (or code) section. By doing this we just set the heap pointer to the very beginning of the heap. -```C -#define RESET_HEAP() -``` - -As we saw just above, it resets the heap by setting the `HEAP` variable to `_end`, where `_end` is just `extern char _end[];` - -Next is the `GET_HEAP` macro: +The next useful macro is: + ```C #define GET_HEAP(type, n) \ ((type *)__get_heap(sizeof(type),__alignof__(type),(n))) ``` -for heap allocation. It calls the internal function `__get_heap` with 3 parameters: +The goal of this macro is to allocate memory on the early heap. This macro calls the `__get_heap` function from the same header file with the following three parameters: -* the size of the datatype to be allocated for -* `__alignof__(type)` specifies how variables of this type are to be aligned -* `n` specifies how many items to allocate +- The size of the datatype to be allocated for +- Specifies how variables of this type are to be aligned +- How many items specified by the first parameter to allocate The implementation of `__get_heap` is: + ```C static inline char *__get_heap(size_t s, size_t a, size_t n) { @@ -106,64 +175,27 @@ static inline char *__get_heap(size_t s, size_t a, size_t n) } ``` -and we will further see its usage, something like: +Let's try to understand how the `__get_heap` function works. First of all we can see here that `HEAP` pointer is assigned to the [aligned](https://en.wikipedia.org/wiki/Data_structure_alignment) address of the memory. The address is aligned based on the size of data type for which we want to allocate memory. After we have got the initial aligned address, we just move the `HEAP` pointer by the requested size. -```C -saved.data = GET_HEAP(u16, saved.x * saved.y); -``` - -Let's try to understand how `__get_heap` works. We can see here that `HEAP` (which is equal to `_end` after `RESET_HEAP()`) is assigned the address of the aligned memory according to the `a` parameter. After this we save the memory address from `HEAP` to the `tmp` variable, move `HEAP` to the end of the allocated block and return `tmp` which is the start address of allocated memory. - -And the last function is: +The last but not least API of the early heap that we will see is the `heap_free` function which checks the availability of the given size of memory on the heap: + ```C static inline bool heap_free(size_t n) { - return (int)(heap_end - HEAP) >= (int)n; + return (int)(heap_end-HEAP) >= (int)n; } ``` -which subtracts value of the `HEAP` pointer from the `heap_end` (we calculated it in the previous [part](linux-bootstrap-2.md)) and returns 1 if there is enough memory available for `n`. - -That's all. Now we have a simple API for heap and can setup video mode. - -Set up video mode --------------------------------------------------------------------------------- +As you may see, the implementation of this function is pretty trivial. It just subtracts the current value of the heap pointer from the address which represents the end of heap memory area. The function returns `true` if there is enough memory for `n` or `false` otherwise. -Now we can move directly to video mode initialization. We stopped at the `RESET_HEAP()` call in the `set_video` function. Next is the call to `store_mode_params` which stores video mode parameters in the `boot_params.screen_info` structure which is defined in [include/uapi/linux/screen_info.h](https://github.com/torvalds/linux/blob/v4.16/include/uapi/linux/screen_info.h) header file. +### Return to the setup of the video mode -If we look at the `store_mode_params` function, we can see that it starts with a call to the `store_cursor_position` function. As you can understand from the function name, it gets information about the cursor and stores it. +Since the heap pointer is in the right place, we can move directly to video mode initialization. The next step after this is the call to `store_mode_params` function which stores currently available video mode parameters in the `boot_params.screen_info`. This structure defined in the [include/uapi/linux/screen_info.h](https://github.com/torvalds/linux/blob/master/include/uapi/linux/screen_info.hh) header file and provides basic information about the screen and video mode. Such as current position of the cursor, the BIOS video mode number that was set when the kernel was loaded, the number of text rows and columns and so on. The `store_mode_params` function asks the BIOS services about this information and stores it in this structure for later usage. -First of all, `store_cursor_position` initializes two variables which have type `biosregs` with `AH = 0x3`, and calls the `0x10` BIOS interruption. After the interruption is successfully executed, it returns row and column in the `DL` and `DH` registers. Row and column will be stored in the `orig_x` and `orig_y` fields of the `boot_params.screen_info` structure. - -After `store_cursor_position` is executed, the `store_video_mode` function will be called. It just gets the current video mode and stores it in `boot_params.screen_info.orig_video_mode`. - -After this, `store_mode_params` checks the current video mode and sets the `video_segment`. After the BIOS transfers control to the boot sector, the following addresses are for video memory: - -``` -0xB000:0x0000 32 Kb Monochrome Text Video Memory -0xB800:0x0000 32 Kb Color Text Video Memory -``` - -So we set the `video_segment` variable to `0xb000` if the current video mode is MDA, HGC, or VGA in monochrome mode and to `0xb800` if the current video mode is in color mode. After setting up the address of the video segment, the font size needs to be stored in `boot_params.screen_info.orig_video_points` with: - -```C -set_fs(0); -font_size = rdfs16(0x485); -boot_params.screen_info.orig_video_points = font_size; -``` - -First of all, we put 0 in the `FS` register with the `set_fs` function. We already saw functions like `set_fs` in the previous part. They are all defined in [arch/x86/boot/boot.h](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/boot.h). Next, we read the value which is located at address `0x485` (this memory location is used to get the font size) and save the font size in `boot_params.screen_info.orig_video_points`. - -```C -x = rdfs16(0x44a); -y = (adapter == ADAPTER_CGA) ? 25 : rdfs8(0x484)+1; -``` - -Next, we get the amount of columns by address `0x44a` and rows by address `0x484` and store them in `boot_params.screen_info.orig_video_cols` and `boot_params.screen_info.orig_video_lines`. After this, execution of `store_mode_params` is finished. - -Next we can see the `save_screen` function which just saves the contents of the screen to the heap. This function collects all the data which we got in the previous functions (like the rows and columns, and stuff) and stores it in the `saved_screen` structure, which is defined as: +The next step is save the current contents of the screen to the heap by calling the `save_screen` function. This function collects all the data which we got in the previous functions (like the rows and columns, and stuff) and stores it in the `saved_screen` structure, which is defined as: + ```C static struct saved_screen { int x, y; @@ -172,25 +204,25 @@ static struct saved_screen { } saved; ``` -It then checks whether the heap has free space for it with: +After the contents of the screen is saved, the next step is to collect currently available video modes in the system. This job is done by the `probe_cards` function defined in the [arch/x86/boot/video-mode.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/video-mode.c). It goes over all `video_cards` and collects the information about them: ```C -if (!heap_free(saved.x*saved.y*sizeof(u16)+512)) - return; +for (card = video_cards; card < video_cards_end; card++) { + /* collecting the number of video modes */ +} ``` -and allocates space in the heap if it is enough and stores `saved_screen` in it. - -The next call is `probe_cards(0)` from [arch/x86/boot/video-mode.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/video-mode.c) source code file. It goes over all video_cards and collects the number of modes provided by the cards. Here is the interesting part, we can see the loop: +The `video_cards` is an array defined as: + ```C -for (card = video_cards; card < video_cards_end; card++) { - /* collecting number of modes here */ -} +#define __videocard struct card_info __section(".videocards") __attribute__((used)) +extern struct card_info video_cards[], video_cards_end[]; ``` -but `video_cards` is not declared anywhere. The answer is simple: every video mode presented in the x86 kernel setup code has a definition that looks like this: +The `__videocard` macro allows to define structures which describe video cards and the linker will put them into the `video_cards` array. Example of such structure can be found in the [arch/x86/boot/video-vga.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/video-vga.c): + ```C static __videocard video_vga = { .card_name = "VGA", @@ -199,50 +231,18 @@ static __videocard video_vga = { }; ``` -where `__videocard` is a macro: - -```C -#define __videocard struct card_info __attribute__((used,section(".videocards"))) -``` - -which means that the `card_info` structure: - -```C -struct card_info { - const char *card_name; - int (*set_mode)(struct mode_info *mode); - int (*probe)(void); - struct mode_info *modes; - int nmodes; - int unsafe; - u16 xmode_first; - u16 xmode_n; -}; -``` - -is in the `.videocards` segment. Let's look in the [arch/x86/boot/setup.ld](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/setup.ld) linker script, where we can find: - -``` - .videocards : { - video_cards = .; - *(.videocards) - video_cards_end = .; - } -``` +After the `probe_cards` function executes we have a bunch of structures in our `video_cards` array and the known number of video modes they provide. At the next step the kernel setup code will print menu with available video modes if the `vid_mode=ask` option was passed to the kernel command line and set up the video mode having all the parameters that we have gathered at the previous steps. The video mode is set by the `set_mode` function is defined in [video-mode.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/video-mode.c). This function expects one parameter - the video mode identifier. This identifier is set by the bootloader or set based on the choice of the video modes menu. The `set_mode` function goes over all available video cards defined in the `video_cards` array and if the given mode belongs to the given card, the `card->set_mode()` callback is called to setup the video mode. -It means that `video_cards` is just a memory address and all `card_info` structures are placed in this segment. It means that all `card_info` structures are placed between `video_cards` and `video_cards_end`, so we can use a loop to go over all of it. After `probe_cards` executes we have a bunch of structures like `static __videocard video_vga` with the `nmodes` (the number of video modes) filled in. - -After the `probe_cards` function is done, we move to the main loop in the `set_video` function. There is an infinite loop which tries to set up the video mode with the `set_mode` function or prints a menu if we passed `vid_mode=ask` to the kernel command line or if video mode is undefined. - -The `set_mode` function is defined in [video-mode.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/video-mode.c) and gets only one parameter, `mode`, which is the number of video modes (we got this value from the menu or in the start of `setup_video`, from the kernel setup header). - -The `set_mode` function checks the `mode` and calls the `raw_set_mode` function. The `raw_set_mode` calls the selected card's `set_mode` function, i.e. `card->set_mode(struct mode_info*)`. We can get access to this function from the `card_info` structure. Every video mode defines this structure with values filled depending upon the video mode (for example for `vga` it is the `video_vga.set_mode` function. See the above example of the `card_info` structure for `vga`). `video_vga.set_mode` is `vga_set_mode`, which checks the vga mode and calls the respective function: +Let's take a look at the example of setting up [VGA](https://en.wikipedia.org/wiki/Video_Graphics_Array) video mode: + ```C static int vga_set_mode(struct mode_info *mode) { + /* Set the basic mode */ vga_set_basic_mode(); + /* Override a possibly broken BIOS */ force_x = mode->x; force_y = mode->y; @@ -268,53 +268,60 @@ static int vga_set_mode(struct mode_info *mode) vga_set_80x60(); break; } + return 0; } ``` -Every function which sets up video mode just calls the `0x10` BIOS interrupt with a certain value in the `AH` register. +The `vga_set_mode` function is responsible for configuring the VGA display to a specific text mode, based on the settings which we collected in the previous steps. The `vga_set_basic_mode` function resets the VGA hardware into a standard text mode. The next statement sets up the video mode based on the video mode that was selected. Most of these functions have very similar implementation based on the `0x10` BIOS interrupt. -After we have set the video mode, we pass it to `boot_params.hdr.vid_mode`. +After this step, the video mode is configured and we save all the information about it again for later use. Having done this, the video mode setup is complete and now we can take a look at the last preparation before we will see the switch into the protected mode. -Next, `vesa_store_edid` is called. This function simply stores the [EDID](https://en.wikipedia.org/wiki/Extended_Display_Identification_Data) (**E**xtended **D**isplay **I**dentification **D**ata) information for kernel use. After this `store_mode_params` is called again. Lastly, if `do_restore` is set, the screen is restored to an earlier state. +## Last preparation before transition into protected mode -Having done this, the video mode setup is complete and now we can switch to the protected mode. +Returning to the [`main`](https://github.com/torvalds/linux/blob/master/arch/x86/boot/main.c) function of the early kernel setup code, we finally can see: -Last preparation before transition into protected mode --------------------------------------------------------------------------------- + +```C + /* Do the last things and invoke protected mode */ + go_to_protected_mode(); +``` -We can see the last function call - `go_to_protected_mode` - in [arch/x86/boot/main.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/main.c). As the comment says: `Do the last things and invoke protected mode`, so let's see what these last things are and switch into protected mode. +As the comment says: `Do the last things and invoke protected mode`, so let's see what these last things are and switch into protected mode. -The `go_to_protected_mode` function is defined in [arch/x86/boot/pm.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/pm.c). It contains some functions which make the last preparations before we can jump into protected mode, so let's look at it and try to understand what it does and how it works. +The `go_to_protected_mode` function is defined in [arch/x86/boot/pm.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/pm.c). It contains some routines which make the last preparations before we can jump into protected mode, so let's look at it and try to understand what it does and how it works. -First is the call to the `realmode_switch_hook` function in `go_to_protected_mode`. This function invokes the real mode switch hook if it is present and disables [NMI](http://en.wikipedia.org/wiki/Non-maskable_interrupt). Hooks are used if the bootloader runs in a hostile environment. You can read more about hooks in the [boot protocol](https://www.kernel.org/doc/Documentation/x86/boot.txt) (see **ADVANCED BOOT LOADER HOOKS**). +The very first function that we may see in the `go_to_protected_mode` is the `realmode_switch_hook` function. This function invokes the real mode switch hook if it is present or disables [NMI](http://en.wikipedia.org/wiki/Non-maskable_interrupt) otherwise. The hooks are used if the bootloader runs in a hostile environment. You can read more about hooks in the [boot protocol](https://www.kernel.org/doc/Documentation/x86/boot.txt) (see **ADVANCED BOOT LOADER HOOKS**). Interrupts must be disabled before switching to protected mode because otherwise the CPU could receive an interrupt when there is no valid interrupt table or handlers. Once the kernel will set up the protected-mode interrupt infrastructure, interrupts will be disabled again. -The `realmode_switch` hook presents a pointer to the 16-bit real mode far subroutine which disables non-maskable interrupts. After the `realmode_switch` hook (it isn't present for me) is checked, Non-Maskable Interrupts(NMI) is disabled: +We will consider only more-less standard use case, when the bootloader does not provide any hooks. So we just disable non-maskable interrupts: + ```assembly -asm volatile("cli"); -outb(0x80, 0x70); /* Disable NMI */ -io_delay(); + asm volatile("cli"); + outb(0x80, 0x70); /* Disable NMI */ + io_delay(); ``` -At first, there is an inline assembly statement with a `cli` instruction which clears the interrupt flag (`IF`). After this, external interrupts are disabled. The next line disables NMI (non-maskable interrupt). - -An interrupt is a signal to the CPU which is emitted by hardware or software. After getting such a signal, the CPU suspends the current instruction sequence, saves its state and transfers control to the interrupt handler. After the interrupt handler has finished its work, it transfers control back to the interrupted instruction. Non-maskable interrupts (NMI) are interrupts which are always processed, independently of permission. They cannot be ignored and are typically used to signal for non-recoverable hardware errors. We will not dive into the details of interrupts now but we will be discussing them in the coming posts. +At the first line, there is an [inline assembly](../Theory/linux-theory-3.md) statement with the `cli` instruction which clears the [interrupt flag](https://en.wikipedia.org/wiki/Interrupt_flag). After this, external interrupts are disabled. The next line disables NMI (non-maskable interrupt). An interrupt is a signal to the CPU which is emitted by hardware or software. After getting such a signal, the CPU suspends the current instruction sequence, saves its state and transfers control to the interrupt handler. After the interrupt handler has finished its work, it transfers control back to the interrupted instruction. Non-maskable interrupts (NMI) are interrupts which are always processed, independently of permission. They cannot be ignored and are typically used to signal for non-recoverable hardware errors. We will not dive into the details of interrupts now but we will be discussing them in the next posts. -Let's get back to the code. We can see in the second line that we are writing the byte `0x80` (disabled bit) to `0x70` (the CMOS Address register). After that, a call to the `io_delay` function occurs. `io_delay` causes a small delay and looks like: +Let's get back to the code. We can see in the second line that we are writing the byte `0x0` to the port `0x80`. After that, a call to the `io_delay` function occurs. `io_delay` causes a small delay and looks like: + ```C static inline void io_delay(void) { const u16 DELAY_PORT = 0x80; - asm volatile("outb %%al,%0" : : "dN" (DELAY_PORT)); + outb(0, DELAY_PORT); } ``` -To output any byte to the port `0x80` should delay exactly 1 microsecond. So we can write any value (the value from `AL` in our case) to the `0x80` port. After this delay the `realmode_switch_hook` function has finished execution and we can move to the next function. +To output any byte to the port `0x80` should delay exactly 1 microsecond. This delay is needed to be sure that the change of the NMI mask has fully taken effect. After this delay, the `realmode_switch_hook` function has finished execution and we can be sure that all interrupts are disabled. -The next function is `enable_a20`, which enables the [A20 line](http://en.wikipedia.org/wiki/A20_line). This function is defined in [arch/x86/boot/a20.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/a20.c) and it tries to enable the A20 gate with different methods. The first is the `a20_test_short` function which checks if A20 is already enabled or not with the `a20_test` function: +The next step is the `enable_a20` function, which enables the [A20 line](http://en.wikipedia.org/wiki/A20_line). Enabling of this line allows kernel to have access above 1 MB. +The `enable_a20` function is defined in [arch/x86/boot/a20.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/a20.c) and tries to enable the `A20` gate using the different approaches. The first is the `a20_test_short` function which checks if `A20` is already enabled or not using the `a20_test` function: + + ```C static int a20_test(int loops) { @@ -339,46 +346,60 @@ static int a20_test(int loops) } ``` -First of all, we put `0x0000` in the `FS` register and `0xffff` in the `GS` register. Next, we read the value at the address `A20_TEST_ADDR` (it is `0x200`) and put this value into the variables `saved` and `ctr`. - -Next, we write an updated `ctr` value into `fs:A20_TEST_ADDR` or `fs:0x200` with the `wrfs32` function, then delay for 1ms, and then read the value from the `GS` register into the address `A20_TEST_ADDR+0x10`. In a case when `a20` line is disabled, the address will be overlapped, in other case if it's not zero `a20` line is already enabled the A20 line. +To verify whether the `A20` line is already enabled or not, the kernel performs a simple memory test. It begins by setting the `FS` register to `0x0000` and the `GS` register to `0xffff` values. By doing this, an access to `FS:0x200` (`A20_TEST_ADDR`) points into the very beginning of memory, while an access to `GS:0x2010` refers to a location just past the one-megabyte boundary. If the `A20` line is disabled, the latter will wrap around and point to the same physical address. -If A20 is disabled, we try to enable it with a different method which you can find in `a20.c`. For example, it can be done with a call to the `0x15` BIOS interrupt with `AH=0x2041`. +If the `A20` gate is disabled, the kernel will try to enable it using different methods which you can find in `enable_a20` function. For example, it can be done with a call to the `0x15` BIOS interrupt with `AH` register set to `0x2041`. If this function finished with a failure, print an error message and call the function `die` which will stop the process of the kernel setup. -If the `enable_a20` function finished with a failure, print an error message and call the function `die`. You can remember it from the first source code file where we started - [arch/x86/boot/header.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/header.S): +After the `A20` gate is successfully enabled, the `reset_coprocessor` function is called: -```assembly -die: - hlt - jmp die - .size die, .-die + +```C +static void reset_coprocessor(void) +{ + outb(0, 0xf0); + io_delay(); + outb(0, 0xf1); + io_delay(); +} ``` -After the A20 gate is successfully enabled, the `reset_coprocessor` function is called: +This function resets the [math coprocessor](https://en.wikipedia.org/wiki/Floating-point_unit) to be sure it is in a clean state by writing `0` to `0xF0` and then resets it by writing `0` to `0xF1`. + +The next step is the `mask_all_interrupts` function: + ```C -outb(0, 0xf0); -outb(0, 0xf1); +static void mask_all_interrupts(void) +{ + outb(0xff, 0xa1); /* Mask all interrupts on the secondary PIC */ + io_delay(); + outb(0xfb, 0x21); /* Mask all but cascade on the primary PIC */ + io_delay(); +} ``` -This function clears the Math Coprocessor by writing `0` to `0xf0` and then resets it by writing `0` to `0xf1`. +This function masks or in other words forbids all interrupts on the primary and secondary [PICs](https://en.wikipedia.org/wiki/Programmable_interrupt_controller). This is needed for safeness, we forbid all the interrupts from the `PIC` so nothing can interrupt the CPU while the kernel is doing transition into protected mode. -After this, the `mask_all_interrupts` function is called: +All the operations before this point, were executed for safe transition to the protected mode. The next operations will prepare the transition to the protected mode. Let's take a look at them. -```C -outb(0xff, 0xa1); /* Mask all interrupts on the secondary PIC */ -outb(0xfb, 0x21); /* Mask all but cascade on the primary PIC */ -``` +## Entering Protected Mode + +At this point, we are very close to see the switching into protected mode of the Linux kernel. -This masks all interrupts on the secondary PIC (Programmable Interrupt Controller) and primary PIC except for IRQ2 on the primary PIC. +Only two steps remain: -And after all of these preparations, we can see the actual transition into protected mode. +- Setting up the Interrupt Descriptor Table +- Setting up the Global Descriptor Table -Set up the Interrupt Descriptor Table --------------------------------------------------------------------------------- +And that’s all! Once these two structures will be configured, the Linux kernel can make the jump into protected mode. -Now we set up the Interrupt Descriptor table (IDT) in the `setup_idt` function: +### Set up the Interrupt Descriptor Table +Before the CPU can safely enter protected mode, it needs to know where to find the handlers that will be triggered in a case of [interrupts and exceptions](https://en.wikipedia.org/wiki/Interrupt). In real mode, the CPU relies on the [Interrupt Vector Table](https://en.wikipedia.org/wiki/Interrupt_vector_table). In the protected mode this mechanism changes to the Interrupt Descriptor Table. + +This is a special structure located in memory which contains descriptors that describes where CPU can find handlers for interrupts and exceptions. The full description of Interrupt Description Table and its entries we will see later, because for now we anyway disabled all the interrupts at the previous steps. Let's take a look at the function which setups zero filled Interrupt Descriptor Table: + + ```C static void setup_idt(void) { @@ -387,8 +408,9 @@ static void setup_idt(void) } ``` -which sets up the Interrupt Descriptor Table (describes interrupt handlers and etc.). For now, the IDT is not installed (we will see it later), but now we just load the IDT with the `lidtl` instruction. `null_idt` contains the address and size of the IDT, but for now they are just zero. `null_idt` is a `gdt_ptr` structure, it is defined as: +As we may see, it just load the IDT which is filled with zero using the `lidtl` instruction. The `null_idt` has type `gdt_ptr` which is structure defined in the same source code file: + ```C struct gdt_ptr { u16 len; @@ -396,239 +418,172 @@ struct gdt_ptr { } __attribute__((packed)); ``` -where we can see the 16-bit length(`len`) of the IDT and the 32-bit pointer to it (More details about the IDT and interruptions will be seen in the next posts). ` __attribute__((packed))` means that the size of `gdt_ptr` is the minimum required size. So the size of the `gdt_ptr` will be 6 bytes here or 48 bits. (Next we will load the pointer to the `gdt_ptr` to the `GDTR` register and you might remember from the previous post that it is 48-bits in size). +This structure provides information about the pointer to the Interrupt Descriptor Table. -Set up Global Descriptor Table --------------------------------------------------------------------------------- +### Set up Global Descriptor Table -Next is the setup of the Global Descriptor Table (GDT). We can see the `setup_gdt` function which sets up the GDT (you can read about it in the post [Kernel booting process. Part 2.](linux-bootstrap-2.md#protected-mode)). There is a definition of the `boot_gdt` array in this function, which contains the definition of the three segments: +The next is the setup of the Global Descriptor Table. As you may remember, the memory access is based on `segment:offset` addressing in real mode. The protected mode introduces the different model based on the `Global Descriptor Table`. If you forgot the details about the Global Description Table structure, you can find more information in the [previous chapter](./linux-bootstrap-2.md#protected-mode). Instead of fixed segment bases and limits, the CPU now looks for memory regions defined by descriptors located in the Global Descriptor Table. The goal of kernel is to setup these descriptors. -```C -static const u64 boot_gdt[] __attribute__((aligned(16))) = { - [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(0xc09b, 0, 0xfffff), - [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(0xc093, 0, 0xfffff), - [GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(0x0089, 4096, 103), -}; -``` - -for code, data and TSS (Task State Segment). We will not use the task state segment for now, it was added there to make Intel VT happy as we can see in the comment line (if you're interested you can find the commit which describes it - [here](https://github.com/torvalds/linux/commit/88089519f302f1296b4739be45699f06f728ec31)). Let's look at `boot_gdt`. First of all note that it has the `__attribute__((aligned(16)))` attribute. It means that this structure will be aligned by 16 bytes. - -Let's look at a simple example: +All the job will be done by the `setup_gdt` function which is defined in the same source code file. Let's take a look at the definition of this function: + ```C -#include - -struct aligned { - int a; -}__attribute__((aligned(16))); - -struct nonaligned { - int b; -}; - -int main(void) +static void setup_gdt(void) { - struct aligned a; - struct nonaligned na; - - printf("Not aligned - %zu \n", sizeof(na)); - printf("Aligned - %zu \n", sizeof(a)); - - return 0; + /* There are machines which are known to not boot with the GDT + being 8-byte unaligned. Intel recommends 16 byte alignment. */ + static const u64 boot_gdt[] __attribute__((aligned(16))) = { + /* CS: code, read/execute, 4 GB, base 0 */ + [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(DESC_CODE32, 0, 0xfffff), + /* DS: data, read/write, 4 GB, base 0 */ + [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(DESC_DATA32, 0, 0xfffff), + /* TSS: 32-bit tss, 104 bytes, base 4096 */ + /* We only have a TSS here to keep Intel VT happy; + we don't actually use it for anything. */ + [GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(DESC_TSS32, 4096, 103), + }; + /* Xen HVM incorrectly stores a pointer to the gdt_ptr, instead + of the gdt_ptr contents. Thus, make it static so it will + stay in memory, at least long enough that we switch to the + proper kernel GDT. */ + static struct gdt_ptr gdt; + + gdt.len = sizeof(boot_gdt)-1; + gdt.ptr = (u32)&boot_gdt + (ds() << 4); + + asm volatile("lgdtl %0" : : "m" (gdt)); } ``` -Technically a structure which contains one `int` field must be 4 bytes in size, but an `aligned` structure will need 16 bytes to store in memory: - -``` -$ gcc test.c -o test && test -Not aligned - 4 -Aligned - 16 -``` - -The `GDT_ENTRY_BOOT_CS` has index - 2 here, `GDT_ENTRY_BOOT_DS` is `GDT_ENTRY_BOOT_CS + 1` and etc. It starts from 2, because the first is a mandatory null descriptor (index - 0) and the second is not used (index - 1). - -`GDT_ENTRY` is a macro which takes flags, base, limit and builds a GDT entry. For example, let's look at the code segment entry. `GDT_ENTRY` takes the following values: - -* base - 0 -* limit - 0xfffff -* flags - 0xc09b - -What does this mean? The segment's base address is 0, and the limit (size of segment) is - `0xfffff` (1 MB). Let's look at the flags. It is `0xc09b` and it will be: - -``` -1100 0000 1001 1011 -``` - -in binary. Let's try to understand what every bit means. We will go through all bits from left to right: +The initial memory descriptors specified by the items of the `boot_gdt` array. The `setup_gdt` function just loads the pointer to the Global Descriptor Table filled with these items using the `lgdtl` instruction. Let's take a closer look at the memory descriptors definition. -* 1 - (G) granularity bit -* 1 - (D) if 0 16-bit segment; 1 = 32-bit segment -* 0 - (L) executed in 64-bit mode if 1 -* 0 - (AVL) available for use by system software -* 0000 - 4-bit length 19:16 bits in the descriptor -* 1 - (P) segment presence in memory -* 00 - (DPL) - privilege level, 0 is the highest privilege -* 1 - (S) code or data segment, not a system segment -* 101 - segment type execute/read/ -* 1 - accessed bit +Initially, the 3 memory descriptors specified: -You can read more about every bit in the previous [post](linux-bootstrap-2.md) or in the [Intel® 64 and IA-32 Architectures Software Developer's Manuals 3A](http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html). +- Code segment +- Memory segment +- Task state segment -After this we get the length of the GDT with: - -```C -gdt.len = sizeof(boot_gdt)-1; -``` +We will skip the description of the task state segment for now as it was added there to make [Intel VT](https://en.wikipedia.org/wiki/X86_virtualization#Intel_virtualization_(VT-x)) happy. The other two segments belongs to the memory for kernel code and data sections. Both memory descriptors defined using the `GDT_ENTRY` macro. This macro defined in the [arch/x86/include/asm/segment.h](https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/segment.h) and expects to get three arguments: -We get the size of `boot_gdt` and subtract 1 (the last valid address in the GDT). +- `flags` +- `base` +- `limit` -Next we get a pointer to the GDT with: +Let's take a look at the definition of the code memory segment: ```C -gdt.ptr = (u32)&boot_gdt + (ds() << 4); +[GDT_ENTRY_BOOT_CS] = GDT_ENTRY(DESC_CODE32, 0, 0xfffff), ``` -Here we just get the address of `boot_gdt` and add it to the address of the data segment left-shifted by 4 bits (remember we're in real mode now). +The base address of this memory segment is defined as `0` and limit as `0xFFFFF` or 1 Megabyte. The `DESC_CODE32` describes the flags of this segment. If we take a look at the flags, we will see that granularity (bit `G`) of this segment is set to 4 KB units. This means that the segment covers addresses `0x00000000–0xFFFFFFFF` - entire 4 GB linear address space. The same base address and limit will be defined for the data segment. It is done this way because Linux kernel using so-called [flat memory model](https://en.wikipedia.org/wiki/Flat_memory_model). -Lastly we execute the `lgdtl` instruction to load the GDT into the GDTR register: +Besides the granularity bit, the `DESC_CODE32` specifies other flags. Among them you can find, the this a 32-bit segment which is readable, executable and present in memory. The privilege level is set to the highest value as kernel needs. -```C -asm volatile("lgdtl %0" : : "m" (gdt)); -``` +Looking at the documentation of the Global Descriptor Table and its entries you can check all the initial segments by yourself. It is not so hard. -Actual transition into protected mode --------------------------------------------------------------------------------- +## Transition into protected mode -This is the end of the `go_to_protected_mode` function. We loaded the IDT and GDT, disabled interrupts and now can switch the CPU into protected mode. The last step is calling the `protected_mode_jump` function with two parameters: +We are standing right before it. Interrupts are disabled, the Interrupt Descriptor Table and Global Descriptor Table are initialized. Finally, the kernel can execute jump into protected mode. But despite good news, we need to return to assembly again 😅 -```C -protected_mode_jump(boot_params.hdr.code32_start, (u32)&boot_params + (ds() << 4)); -``` - -which is defined in [arch/x86/boot/pmjump.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/pmjump.S). - -It takes two parameters: - -* address of the protected mode entry point -* address of `boot_params` - -Let's look inside `protected_mode_jump`. As I wrote above, you can find it in `arch/x86/boot/pmjump.S`. The first parameter will be in the `eax` register and the second one is in `edx`. - -First of all, we put the address of `boot_params` in the `esi` register and the address of the code segment register `cs` in `bx`. +The transition to the protected mode we can find in the [arch/x86/boot/pmjump.S](https://github.com/torvalds/linux/blob/master/arch/x86/boot/pmjump.S). Let's take a look at it: + ```assembly -GLOBAL(protected_mode_jump) - movl %edx, %esi # Pointer to boot_params table +SYM_FUNC_START_NOALIGN(protected_mode_jump) + movl %edx, %esi # Pointer to boot_params table xorl %ebx, %ebx movw %cs, %bx -``` - -After this, we shift `bx` by 4 bits and add it to the memory location labeled `2` (which is `(cs << 4) + in_pm32`, the physical address to jump after transitioned to 32-bit mode) and jump to label `1`. - -```assembly shll $4, %ebx - addl %ebx, 2f # Add %ebx to the value stored at label 2 - jmp 1f # Short jump to serialize on 386/486 -``` - -So after this `in_pm32` in label `2` will be overwritten with `(cs << 4) + in_pm32`. - -Next we put the data segment and the task state segment in the `cx` and `di` registers with: + addl %ebx, 2f + jmp 1f # Short jump to serialize on 386/486 +1: -```assembly movw $__BOOT_DS, %cx movw $__BOOT_TSS, %di -``` -As you can read above `GDT_ENTRY_BOOT_CS` has index 2 and every GDT entry is 8 byte, so `CS` will be `2 * 8 = 16`, `__BOOT_DS` is 24 etc. - -Next, we set the `PE` (Protection Enable) bit in the `CR0` control register: - -```assembly movl %cr0, %edx - orb $X86_CR0_PE, %dl + orb $X86_CR0_PE, %dl # Protected mode movl %edx, %cr0 ``` -and make a long jump to protected mode: +First of all, we preserve the address of `boot_params` structure in the `esi` register. After this, we compute the real-mode segment base of the current code and add it to the value pointed to by the `2f` label which is the entry point to the protected mode. This is needed because as you remember at the previous step, the code memory segment starts from `0`, so the jump instruction must contain absolute linear address of the entry point. + +At the next steps we save the segment addresses of the data and task state in general purpose registers `cx` and `di` and set the `PE` bit in the control `cr0` register. From this point, the protected mode is turned on, and we need just to jump into it, to set proper value of the code segment: + ```assembly - .byte 0x66, 0xea -2: .long in_pm32 - .word __BOOT_CS + # Transition to 32-bit mode + .byte 0x66, 0xea # ljmpl opcode +2: .long .Lin_pm32 # offset + .word __BOOT_CS # segment ``` -where: - -* `0x66` is the operand-size prefix which allows us to mix 16-bit and 32-bit code -* `0xea` - is the jump opcode -* `in_pm32` is the segment offset under protect mode, which has value `(cs << 4) + in_pm32` derived from real mode -* `__BOOT_CS` is the code segment we want to jump to. - -After this we are finally in protected mode: +The kernel is in protected mode now 🥳🥳🥳 + ```assembly -.code32 -.section ".text32","ax" + .code32 + .section ".text32","ax" +SYM_FUNC_START_LOCAL_NOALIGN(.Lin_pm32) ``` -Let's look at the first steps taken in protected mode. First of all we set up the data segment with: +Let's look at the first steps taken in the protected mode. First of all we set up the data segment with the data segment address that we preserved in the `cx` register at the previous step: + ```assembly -movl %ecx, %ds -movl %ecx, %es -movl %ecx, %fs -movl %ecx, %gs -movl %ecx, %ss + # Set up data segments for flat 32-bit mode + movl %ecx, %ds + movl %ecx, %es + movl %ecx, %fs + movl %ecx, %gs + movl %ecx, %ss ``` -If you paid attention, you can remember that we saved `$__BOOT_DS` in the `cx` register. Now we fill it with all segment registers besides `cs` (`cs` is already `__BOOT_CS`). - -And setup a valid stack for debugging purposes: +Since we are in the protected mode, our segment bases point to zero. Because of this, the stack pointer will point somewhere below the code, so we need to adjust it, at least for debugging purposes: + ```assembly -addl %ebx, %esp + addl %ebx, %esp ``` -The last step before the jump into 32-bit entry point is to clear the general purpose registers: +The last step before the jump into actual 32-bit entry point is to clear the general purpose registers: + ```assembly -xorl %ecx, %ecx -xorl %edx, %edx -xorl %ebx, %ebx -xorl %ebp, %ebp -xorl %edi, %edi + xorl %ecx, %ecx + xorl %edx, %edx + xorl %ebx, %ebx + xorl %ebp, %ebp + xorl %edi, %edi ``` -And jump to the 32-bit entry point in the end: +Now everything is ready. The kernel is in the protected mode and we can jump to the next code, address of which was passed in the `eax` register: + +```assembly + jmpl *%eax # Jump to the 32-bit entrypoint ``` -jmpl *%eax -``` - -Remember that `eax` contains the address of the 32-bit entry (we passed it as the first parameter into `protected_mode_jump`). - -That's all. We're in protected mode and stop at its entry point. We will see what happens next in the next part. - -Conclusion --------------------------------------------------------------------------------- -This is the end of the third part about linux kernel insides. In the next part, we will look at the first steps we take in protected mode and transition into [long mode](http://en.wikipedia.org/wiki/Long_mode). +## Conclusion -If you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX). +This is the end of the third part about Linux kernel insides. If you have questions or suggestions, feel free ping me on X - [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com), or just create an [issue](https://github.com/0xAX/linux-insides/issues/new). -**Please note that English is not my first language, And I am really sorry for any inconvenience. If you find any mistakes, please send me a PR with corrections at [linux-insides](https://github.com/0xAX/linux-internals).** +## Links -Links --------------------------------------------------------------------------------- +Here is the list of the links that you may find useful during reading of this chapter: -* [VGA](http://en.wikipedia.org/wiki/Video_Graphics_Array) -* [VESA BIOS Extensions](http://en.wikipedia.org/wiki/VESA_BIOS_Extensions) -* [Data structure alignment](http://en.wikipedia.org/wiki/Data_structure_alignment) -* [Non-maskable interrupt](http://en.wikipedia.org/wiki/Non-maskable_interrupt) -* [A20](http://en.wikipedia.org/wiki/A20_line) -* [GCC designated inits](https://gcc.gnu.org/onlinedocs/gcc-4.1.2/gcc/Designated-Inits.html) -* [GCC type attributes](https://gcc.gnu.org/onlinedocs/gcc/Type-Attributes.html) -* [Previous part](linux-bootstrap-2.md) +- [QEMU](https://www.qemu.org/) +- [VGA](http://en.wikipedia.org/wiki/Video_Graphics_Array) +- [VESA BIOS Extensions](http://en.wikipedia.org/wiki/VESA_BIOS_Extensions) +- [Data structure alignment](http://en.wikipedia.org/wiki/Data_structure_alignment) +- [Non-maskable interrupt](http://en.wikipedia.org/wiki/Non-maskable_interrupt) +- [A20](http://en.wikipedia.org/wiki/A20_line) +- [Math coprocessor](https://en.wikipedia.org/wiki/Floating-point_unit) +- [PIC](https://en.wikipedia.org/wiki/Programmable_interrupt_controller) +- [Interrupts and exceptions](https://en.wikipedia.org/wiki/Interrupt) +- [Interrupt Vector Table](https://en.wikipedia.org/wiki/Interrupt_vector_table) +- [Protected mode](https://en.wikipedia.org/wiki/Protected_mode) +- [Intel VT](https://en.wikipedia.org/wiki/X86_virtualization#Intel_virtualization_(VT-x)) +- [Flat memory model](https://en.wikipedia.org/wiki/Flat_memory_model) +- [Previous part](linux-bootstrap-2.md) diff --git a/Booting/linux-bootstrap-4.md b/Booting/linux-bootstrap-4.md index 87f6f273..52d1a7d9 100644 --- a/Booting/linux-bootstrap-4.md +++ b/Booting/linux-bootstrap-4.md @@ -1,26 +1,50 @@ -Kernel booting process. Part 4. -================================================================================ +# Kernel booting process. Part 4. -The Transition to 64-bit mode --------------------------------------------------------------------------------- +In the previous [part](./linux-bootstrap-3.md), we saw the transition from the [real mode](https://en.wikipedia.org/wiki/Real_mode) into [protected mode](http://en.wikipedia.org/wiki/Protected_mode). At this point, the two crucial things were changed - the processor can address up to 4 gigabytes of memory and the privilege levels were set for the memory access. Despite this, the kernel is still in its early setup mode. There are many different things that has to be prepared and configured before we will reach the main kernel's entry point. Since we are learning the Linux kernel for `x86_64` processors, the protected mode is not the main mode where the processor should operate. The next crucial step is to switch to the native mode for `x86_64` - [long mode](https://en.wikipedia.org/wiki/Long_mode). -This is the fourth part of the `Kernel booting process`. Here, we will learn about the first steps taken in [protected mode](http://en.wikipedia.org/wiki/Protected_mode), like checking if the CPU supports [long mode](http://en.wikipedia.org/wiki/Long_mode) and [SSE](http://en.wikipedia.org/wiki/Streaming_SIMD_Extensions). We will initialize the page tables with [paging](http://en.wikipedia.org/wiki/Paging) and, at the end, transition the CPU to [long mode](https://en.wikipedia.org/wiki/Long_mode). +The main characteristic of this new mode, as with all the earlier modes - the way it defines the memory model. In real mode, the memory model was relatively simple and each memory location was formed based on the base address specified in a segment register and plus some offset. In protected mode, introduced Global and Local descriptor table with descriptors which describe memory areas. All the memory accesses in long mode are based on the new mechanism called [paging](https://en.wikipedia.org/wiki/Memory_paging). One of the crucial goal of the kernel before it can switch to the long mode is to setup paging. This and all other details needed to switch to long mode we will see in this chapter. -**NOTE: there will be lots of assembly code in this part, so if you are not familiar with that, you might want to consult a book about it** +> [!NOTE] +> There will be lots of assembly code in this part, so if you are not familiar with that, you might want to consult a book about it or read another set of my [posts](https://github.com/0xAX/asm) about assembly programming. -In the previous [part](https://github.com/0xAX/linux-insides/blob/v4.16/Booting/linux-bootstrap-3.md) we stopped at the jump to the `32-bit` entry point in [arch/x86/boot/pmjump.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/pmjump.S): +## The 32-bit kernel entry point location +The last point where we stopped was the [jump](https://en.wikipedia.org/wiki/Branch_(computer_science)#Implementation) to the kernel's entry point in protected mode. This jump is defined in the [arch/x86/boot/pmjump.S](https://github.com/torvalds/linux/blob/master/arch/x86/boot/pmjump.S) and looks like this: + + ```assembly -jmpl *%eax + jmpl *%eax # Jump to the 32-bit entrypoint +``` + +The value of the `eax` register contains the address of the `32-bit` entry point. What is this address? To answer on this question, we can read the [Linux kernel x86 boot protocol](https://www.kernel.org/doc/Documentation/x86/boot.txt) document: + +> When using bzImage, the protected-mode kernel was relocated to 0x100000 + +We can make make sure that this 32-bit entry point of the Linux kernel using the [GNU GDB](https://sourceware.org/gdb/) debugger and running the Linux kernel in the [QEMU](https://www.qemu.org/) virtual machine. To do this, you can run the following command in one terminal: + +```bash +sudo qemu-system-x86_64 -kernel ./linux/arch/x86/boot/bzImage \ + -nographic \ + -append "console=ttyS0 nokaslr" -s -S \ + -initrd /boot/initramfs-6.17.0-rc3-g1b237f190eb3.img ``` -You will recall that the `eax` register contains the address of the 32-bit entry point. We can read about this in the [linux kernel x86 boot protocol](https://www.kernel.org/doc/Documentation/x86/boot.txt): +> [!NOTE] +> You need to pass your own kernel image and [initrd](https://en.wikipedia.org/wiki/Initial_ramdisk) image to the `-kernel` and `-initrd` command line options. + +After this, run the GNU GDB debugger in another terminal and pass the following commands: ``` -When using bzImage, the protected-mode kernel was relocated to 0x100000 +$ gdb +(gdb) target remote :1234 +(gdb) hbreak *0x100000 +(gdb) c +Continuing. + +Breakpoint 1, 0x0000000000100000 in ?? () ``` -Let's make sure that this is so by looking at the register values at the 32-bit entry point: +As soon as the debugger stopped at the [breakpoint](https://en.wikipedia.org/wiki/Breakpoint), we can inspect registers to be sure that the `eax` register contains the `0x100000` - address of the 32-bit kernel entry point: ``` eax 0x100000 1048576 @@ -33,51 +57,92 @@ esi 0x14470 83056 edi 0x0 0 eip 0x100000 0x100000 eflags 0x46 [ PF ZF ] -cs 0x10 16 -ss 0x18 24 -ds 0x18 24 -es 0x18 24 -fs 0x18 24 -gs 0x18 24 ``` -We can see here that the `cs` register contains a value of `0x10` (as you might recall from the [previous part](https://github.com/0xAX/linux-insides/blob/v4.16/Booting/linux-bootstrap-3.md), this is the second index in the `Global Descriptor Table`), the `eip` register contains the value `0x100000` and the base address of all segments including the code segment are zero. +From the previous part, you may remember: + +> First of all, we preserve the address of `boot_params` structure in the `esi` register. + +So the `esi` register has the pointer to the `boot_params`. Let's inspect it to make sure that it is really it. For example we can take a look at the command line string that we passed to the virtual machine: + +``` +(gdb) x/s ((struct boot_params *)$rsi)->hdr.cmd_line_ptr +0x20000: "console=ttyS0 nokaslr" +(gdb) ptype struct boot_params +type = struct boot_params { + struct screen_info screen_info; + struct apm_bios_info apm_bios_info; + __u8 _pad2[4]; + __u64 tboot_addr; + struct ist_info ist_info; + __u64 acpi_rsdp_addr; + __u8 _pad3[8]; + __u8 hd0_info[16]; + __u8 hd1_info[16]; + struct sys_desc_table sys_desc_table; + struct olpc_ofw_header olpc_ofw_header; + __u32 ext_ramdisk_image; + __u32 ext_ramdisk_size; + __u32 ext_cmd_line_ptr; + __u8 _pad4[112]; + __u32 cc_blob_address; + struct edid_info edid_info; + struct efi_info efi_info; + __u32 alt_mem_k; + __u32 scratch; + __u8 e820_entries; + __u8 eddbuf_entries; + __u8 edd_mbr_sig_buf_entries; + __u8 kbd_status; + __u8 secure_boot; + __u8 _pad5[2]; + __u8 sentinel; + __u8 _pad6[1]; + struct setup_header hdr; + __u8 _pad7[36]; + __u32 edd_mbr_sig_buffer[16]; + struct boot_e820_entry e820_table[128]; + __u8 _pad8[48]; + struct edd_info eddbuf[6]; + __u8 _pad9[276]; +} +(gdb) x/s ((struct boot_params *)$rsi)->hdr.cmd_line_ptr +0x20000: "console=ttyS0 nokaslr" +``` + +We got it 🎉 -So, the physical address where the kernel is loaded would be `0:0x100000` or just `0x100000`, as specified by the boot protocol. Now let's start with the `32-bit` entry point. +Now we know where we are, so let's take a look at the code and proceed with learning of the Linux kernel. -The 32-bit entry point --------------------------------------------------------------------------------- +## First steps in the protected mode -The `32-bit` entry point is defined in the [arch/x86/boot/compressed/head_64.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/head_64.S) assembly source code file: +The `32-bit` entry point is defined in the [arch/x86/boot/compressed/head_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/head_64.S) assembly source code file: + ```assembly - __HEAD .code32 -ENTRY(startup_32) -.... -.... -.... -ENDPROC(startup_32) +SYM_FUNC_START(startup_32) ``` -First, why is the directory named `compressed`? The answer to that is that `bzimage` is a gzipped package consisting of `vmlinux`, `header` and ` kernel setup code`. We looked at kernel setup code in all of the previous parts. The main goal of the code in `head_64.S` is to prepare to enter long mode, enter it and then decompress the kernel. We will look at all of the steps leading to kernel decompression in this part. +First of all, it is worth to know is the directory named `compressed`? The answer to that is that the kernel is in the [`bzImage`](https://en.wikipedia.org/wiki/Vmlinux#bzImage) file. This file is a compressed package consisting of kernel image and the kernel setup code. In all previous chapters we were researching the kernel setup code. The next two big steps which remaining before we will see the entrypoint of the kernel are: + +- switch to long mode +- decompress the kernel image and jump to its entrypoint -You will find two files in the `arch/x86/boot/compressed` directory: +In this part we will focus at the first big step and the steps leading to the kernel decompression and the decompression itself we will see in the next chapters. Returning to the current kernel code, you may find the two following files in the [arch/x86/boot/compressed](https://github.com/torvalds/linux/tree/master/arch/x86/boot/compressed) directory: -* [head_32.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/head_32.S) -* [head_64.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/head_64.S) +- [head_32.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/head_32.S) +- [head_64.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/head_64.S) -but we will consider only the `head_64.S` source code file because, as you may remember, this book is only `x86_64` related; Let's look at [arch/x86/boot/compressed/Makefile](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/Makefile). We can find the following `make` target here: +In our case, we will consider only the `head_64.S` file. Yes, the file named with the `64` suffix despite the kernel is in the 32-bit protected mode for this moment. The explanation for this situation is simple. Let's look at [arch/x86/boot/compressed/Makefile](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/Makefile). We may see the following `make` goal here: ```Makefile -vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ - $(obj)/string.o $(obj)/cmdline.o \ +vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/kernel_info.o $(obj)/head_$(BITS).o \ + $(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/error.o \ $(obj)/piggy.o $(obj)/cpuflags.o ``` -The first line contains this- `$(obj)/head_$(BITS).o`. - -This means that we will select which file to link based on what `$(BITS)` is set to, either `head_32.o` or `head_64.o`. The `$(BITS)` variable is defined elsewhere in [arch/x86/Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/Makefile) based on the kernel configuration: +The first line contains this the following target - `$(obj)/head_$(BITS).o`. This means, that `make` will select the file during kernel build process based on the value of the `$(BITS)`. This `make` variable defined in the [arch/x86/Makefile](https://github.com/torvalds/linux/blob/master/arch/x86/Makefile) make file and its value depends on the kernel configuration: ```Makefile ifeq ($(CONFIG_X86_32),y) @@ -91,259 +156,187 @@ else endif ``` -Now that we know where to start, let's get to it. +Since we are consider the kernel for `x86_64` architecture, we assume that the `CONFIG_X86_64` is set to `y`. As the result, the `head_64.S` file will be used during the kernel build process. Let's start to investigate this what the kernel does in this file. + +### Reload the segments if needed -Reload the segments if needed --------------------------------------------------------------------------------- +As we already know, our start is in the [arch/x86/boot/compressed/head_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/head_64.S) assembly source code file. The entry point is defined by the `startup_32` symbol. -As indicated above, we start in the [arch/x86/boot/compressed/head_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/compressed/head_64.S) assembly source code file. We first see the definition of a special section attribute before the definition of the `startup_32` function: +In the beginning of the `startup_32`, we can see the `cld` instruction which clears the `DF` or [direction flag](https://en.wikipedia.org/wiki/Direction_flag) bit in the [flags](https://en.wikipedia.org/wiki/FLAGS_register) register: + ```assembly - __HEAD - .code32 -ENTRY(startup_32) + .code32 +SYM_FUNC_START(startup_32) + /* + * 32bit entry is 0 and it is ABI so immutable! + * If we come here directly from a bootloader, + * kernel(text+data+bss+brk) ramdisk, zero_page, command line + * all need to be under the 4G limit. + */ + cld ``` -`__HEAD` is a macro defined in the [include/linux/init.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/init.h) header file and expands to the definition of the following section: +When the direction flag is clear, all string operations which usually used for copying data, like for example [stos](http://x86.renejeschke.de/html/file_module_x86_id_306.html), [scas](http://x86.renejeschke.de/html/file_module_x86_id_287.html) and others, will increment the index registers `esi` or `edi`. We need to clear the direction flag because later we will use strings operations to perform various operations such as clearing space for page tables or copying data. -```C -#define __HEAD .section ".head.text","ax" -``` +The next instruction is to disable interrupts - `cli`. We already have seen it in previous chapter. The interrupts are disabled "twice" because modern bootloaders can load the kernel starting from this point but not only one that we have seen in the [first chapter](./linux-bootstrap-1.md). -Here, `.head.text` is the name of the section and `ax` is a set of flags. In our case, these flags show us that this section is [executable](https://en.wikipedia.org/wiki/Executable) or in other words contains code. We can find the definition of this section in the [arch/x86/boot/compressed/vmlinux.lds.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/compressed/vmlinux.lds.S) linker script: +After these two simple instructions, the next step is to calculate the difference between where the kernel is compiled to run, and where it actually was loaded. If we will take a look at the linker [script](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/vmlinux.lds.S), we will see the following definition: -``` + +```linker-script SECTIONS { + /* Be careful parts of head_64.S assume startup_32 is at + * address 0. + */ . = 0; - .head.text : { - _head = . ; - HEAD_TEXT - _ehead = . ; - } - ... - ... - ... -} ``` -If you are not familiar with the syntax of the `GNU LD` linker scripting language, you can find more information in its [documentation](https://sourceware.org/binutils/docs/ld/Scripts.html#Scripts). In short, the `.` symbol is a special linker variable, the location counter. The value assigned to it is an offset relative to the segment. In our case, we set the location counter to zero. This means that our code is linked to run from an offset of `0` in memory. This is also stated in the comments: +This means that the code in this section is compiled to run at the address zero. We also can see this in the output of `objdump` utility: -``` -Be careful parts of head_64.S assume startup_32 is at address 0. -``` +```bash +$ objdump -D /home/alex/disk/dev/linux/arch/x86/boot/compressed/vmlinux | less -Now that we have our bearings, let's look at the contents of the `startup_32` function. +/home/alex/disk/dev/linux/arch/x86/boot/compressed/vmlinux: file format elf64-x86-64 -In the beginning of the `startup_32` function, we can see the `cld` instruction which clears the `DF` bit in the [flags](https://en.wikipedia.org/wiki/FLAGS_register) register. When the direction flag is clear, all string operations like [stos](http://x86.renejeschke.de/html/file_module_x86_id_306.html), [scas](http://x86.renejeschke.de/html/file_module_x86_id_287.html) and others will increment the index registers `esi` or `edi`. We need to clear the direction flag because later we will use strings operations to perform various operations such as clearing space for page tables. -After we have cleared the `DF` bit, the next step is to check the `KEEP_SEGMENTS` flag in the `loadflags` kernel setup header field. If you remember, we already talked about `loadflags` in the very first [part](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-1) of this book. There we checked the `CAN_USE_HEAP` flag to query the ability to use the heap. Now we need to check the `KEEP_SEGMENTS` flag. This flag is described in the linux [boot protocol](https://www.kernel.org/doc/Documentation/x86/boot.txt) documentation: +Disassembly of section .head.text: -``` -Bit 6 (write): KEEP_SEGMENTS - Protocol: 2.07+ - - If 0, reload the segment registers in the 32bit entry point. - - If 1, do not reload the segment registers in the 32bit entry point. - Assume that %cs %ds %ss %es are all set to flat segments with - a base of 0 (or the equivalent for their environment). +0000000000000000 : + 0: fc cld + 1: fa cli ``` -So, if the `KEEP_SEGMENTS` bit is not set in `loadflags`, we need to set the `ds`, `ss` and `es` segment registers to the index of the data segment with a base of `0`. That we do: +We may see that and the linker script and the `objdump` utility tells us that the address of the `startup_32` function is `0` but it is not where the kernel was loaded. This is only the address that the code was compiled for, also called link-time address. Why it was done like that? The answer is for simplicity. By telling the linker to set the address of the very first symbol to zero, each next symbol becomes a simple offset from 0. As we already know, the kernel was loaded at the `0x100000` address. The difference between this address and zero called relocation delta. Once that delta is known, the code can reach any variable or function by adding this delta to their compile-time addresses. -```C - testb $KEEP_SEGMENTS, BP_loadflags(%esi) - jnz 1f +We know these addresses and as the result the value of delta based on experiment we have seen above. Now let's take a look how the kernel calculates this difference: - cli - movl $(__BOOT_DS), %eax - movl %eax, %ds - movl %eax, %es - movl %eax, %ss + +```assembly + leal (BP_scratch+4)(%esi), %esp + call 1f +1: popl %ebp + subl $ rva(1b), %ebp ``` -Remember that `__BOOT_DS` is `0x18` (the index of the data segment in the [Global Descriptor Table](https://en.wikipedia.org/wiki/Global_Descriptor_Table)). If `KEEP_SEGMENTS` is set, we jump to the nearest `1f` label or update segment registers with `__BOOT_DS` if they are not set. This is all pretty easy, but here's something to consider. If you've read the previous [part](https://github.com/0xAX/linux-insides/blob/v4.16/Booting/linux-bootstrap-3.md), you may remember that we already updated these segment registers right after we switched to [protected mode](https://en.wikipedia.org/wiki/Protected_mode) in [arch/x86/boot/pmjump.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/pmjump.S). So why do we need to care about the values in the segment registers again? The answer is easy. The Linux kernel also has a 32-bit boot protocol and if a bootloader uses *that* to load the Linux kernel, all the code before the `startup_32` function will be missed. In this case, the `startup_32` function would be the first entry point to the Linux kernel right after the bootloader and there are no guarantees that the segment registers will be in a known state. - -After we have checked the `KEEP_SEGMENTS` flag and set the segment registers to a correct value, the next step is to calculate the difference between where the kernel is compiled to run, and where we loaded it. Remember that `setup.ld.S` contains the following definition: `. = 0` at the start of the `.head.text` section. This means that the code in this section is compiled to run at the address `0`. We can see this in the output of `objdump`: - -``` -arch/x86/boot/compressed/vmlinux: file format elf64-x86-64 +The `call` instruction is used to get the real address of the kernel. This trick works because after the `call` instruction is executed, the stack should have return address on its top. In the code above we setup a temporary mini stack to get the address of the kernel and execute the call to the nearest label `1`. Since the top of the stack contains the return address, we put it into the `ebp` register. Using the last instruction we subtract the difference between the address of the label `1` and `strtup_32` address from the return address that we got at the previous step: +![startup_32](./images/startup_32.svg) -Disassembly of section .head.text: +Starting from this moment, the `ebp` register will contain the address of the beginning of the kernel image and using it we can calculate offset to any other symbols or structures in memory. And the first such structure that we will access is the Global Descriptor Table. To switch to long mode, we need to update the previously loaded Global Descriptor Table with `64-bit` segments: -0000000000000000 : - 0: fc cld - 1: f6 86 11 02 00 00 40 testb $0x40,0x211(%rsi) + +```assembly + leal rva(gdt)(%ebp), %eax + movl %eax, 2(%eax) + lgdt (%eax) ``` -The `objdump` util tells us that the address of the `startup_32` function is `0` but that isn't so. We now need to know where we actually are. This is pretty simple to do in [long mode](https://en.wikipedia.org/wiki/Long_mode) because it supports `rip` relative addressing, but currently we are in [protected mode](https://en.wikipedia.org/wiki/Protected_mode). We will use a common pattern to find the address of the `startup_32` function. We need to define a label, make a call to it and pop the top of the stack to a register: +Where the new Global Descriptor Table is: + ```assembly -call label -label: pop %reg +SYM_DATA_START_LOCAL(gdt) + .word gdt_end - gdt - 1 + .long 0 + .word 0 + .quad 0x00cf9a000000ffff /* __KERNEL32_CS */ + .quad 0x00af9a000000ffff /* __KERNEL_CS */ + .quad 0x00cf92000000ffff /* __KERNEL_DS */ + .quad 0x0080890000000000 /* TS descriptor */ + .quad 0x0000000000000000 /* TS continued */ +SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end) ``` -After this, the register indicated by `%reg` will contain the address of `label`. Let's look at the code which uses this pattern to search for the `startup_32` function in the Linux kernel: +The new Global Descriptor table contains five descriptors: -```assembly - leal (BP_scratch+4)(%esi), %esp - call 1f -1: popl %ebp - subl $1b, %ebp -``` +- `32-bit` kernel code segment +- `64-bit` kernel code segment +- `32-bit` kernel data segment +- Task state descriptor +- Second task state descriptor -As you remember from the previous part, the `esi` register contains the address of the [boot_params](https://github.com/torvalds/linux/blob/v4.16/arch/x86/include/uapi/asm/bootparam.h#L113) structure which was filled before we moved to the protected mode. The `boot_params` structure contains a special field `scratch` with an offset of `0x1e4`. This four byte field is a temporary stack for the `call` instruction. We set `esp` to the address four bytes after the `BP_scratch` field of the `boot_params` structure. We add `4` bytes to the base of the `BP_scratch` field because, as just described, it will be a temporary stack and the stack grows from the top to bottom in the `x86_64` architecture. So our stack pointer will point to the top of the temporary stack. Next, we can see the pattern that I've described above. We make a call to the `1f` label and pop the top of the stack onto `ebp`. This works because `call` stores the return address of the current function on the top of the stack. We now have the address of the `1f` label and can now easily get the address of the `startup_32` function. We just need to subtract the address of the label from the address we got from the stack: +We already saw the loading the Global Descriptor Table in the previous [part](./linux-bootstrap-3.md#set-up-global-descriptor-table), and now we're doing almost the same here, but we set descriptors to use `CS.L = 1` and `CS.D = 0` for execution in 64 bit mode. -``` -startup_32 (0x0) +-----------------------+ - | | - | | - | | - | | - | | - | | - | | - | | -1f (0x0 + 1f offset) +-----------------------+ %ebp - real physical address - | | - | | - +-----------------------+ -``` +After the new Global Descriptor Table is loaded, the kernel can setup the new stack: -The `startup_32` function is linked to run at the address `0x0` and this means that `1f` has the address `0x0 + offset to 1f`, which is approximately `0x21` bytes. The `ebp` register contains the real physical address of the `1f` label. So, if we subtract `1f` from the `ebp` register, we will get the real physical address of the `startup_32` function. The Linux kernel [boot protocol](https://www.kernel.org/doc/Documentation/x86/boot.txt) says the base of the protected mode kernel is `0x100000`. We can verify this with [gdb](https://en.wikipedia.org/wiki/GNU_Debugger). Let's start the debugger and add a breakpoint at the address of `1f`, which is `0x100021`. If this is correct we will see the value `0x100021` in the `ebp` register: + +```assembly + movl $__BOOT_DS, %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %fs + movl %eax, %gs + movl %eax, %ss + /* Setup a stack and load CS from current GDT */ + leal rva(boot_stack_end)(%ebp), %esp ``` -$ gdb -(gdb)$ target remote :1234 -Remote debugging using :1234 -0x0000fff0 in ?? () -(gdb)$ br *0x100022 -Breakpoint 1 at 0x100022 -(gdb)$ c -Continuing. -Breakpoint 1, 0x00100022 in ?? () -(gdb)$ i r -eax 0x18 0x18 -ecx 0x0 0x0 -edx 0x0 0x0 -ebx 0x0 0x0 -esp 0x144a8 0x144a8 -ebp 0x100021 0x100021 -esi 0x142c0 0x142c0 -edi 0x0 0x0 -eip 0x100022 0x100022 -eflags 0x46 [ PF ZF ] -cs 0x10 0x10 -ss 0x18 0x18 -ds 0x18 0x18 -es 0x18 0x18 -fs 0x18 0x18 -gs 0x18 0x18 -``` +At the previous step we loaded new Global Descriptor Table, but all the segment registers may have selectors from old table. If those selectors point to invalid entries in the new Global Descriptor Table, next memory access can cause [General Protection Fault](https://en.wikipedia.org/wiki/General_protection_fault). Setting them to the `__BOOT_DS` which is a known-good descriptor should fix this potential fault and allow us to set proper stack pointed by the `boot_stack_end`. -If we execute the next instruction, `subl $1b, %ebp`, we will see: +The last action after we loaded the new Global Descriptor Table is to reload `cs` descriptor: -``` -(gdb) nexti -... -... -... -ebp 0x100000 0x100000 -... -... -... + +```assembly + pushl $__KERNEL32_CS + leal rva(1f)(%ebp), %eax + pushl %eax + lretl +1: ``` -Ok, we've verified that the address of the `startup_32` function is `0x100000`. After we know the address of the `startup_32` label, we can prepare for the transition to [long mode](https://en.wikipedia.org/wiki/Long_mode). Our next goal is to setup the stack and verify that the CPU supports long mode and [SSE](http://en.wikipedia.org/wiki/Streaming_SIMD_Extensions). +Since we can not change segment registers using simple `mov` instruction, we need to apply a trick with the `lretl` instruction. This instruction fetches the two values from the top of the stack and put the first value into the `eip` register and the second value to the `cs` register. Since this moment we have proper kernel code selector and instruction pointer values. -Stack setup and CPU verification --------------------------------------------------------------------------------- +Just a couple of steps separate us from the transition into the long mode. As it was mentioned in the beginning of this chapter, one of the most crucial is to setup `paging`. But before this task, the kernel needs to do last preparations which we will see in the next sections. -We can't set up the stack until we know where in memory the `startup_32` label is. If we imagine the stack as an array, the stack pointer register `esp` must point to the end of it. Of course, we can define an array in our code, but we need to know its actual address to configure the stack pointer correctly. Let's look at the code: +## Last steps before paging setup -```assembly - movl $boot_stack_end, %eax - addl %ebp, %eax - movl %eax, %esp -``` +As we mentioned in the previous section, there a couple of additional steps before we can setup paging and switch to long mode. These steps are: -The `boot_stack_end` label is also defined in the [arch/x86/boot/compressed/head_64.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/head_64.S) assembly source code file and is located in the [.bss](https://en.wikipedia.org/wiki/.bss) section: +- Verification of CPU +- Calculation of the relocation address +- Enabling `PAE` mode -```assembly - .bss - .balign 4 -boot_heap: - .fill BOOT_HEAP_SIZE, 1, 0 -boot_stack: - .fill BOOT_STACK_SIZE, 1, 0 -boot_stack_end: -``` +In the next sections we will take a look at these steps. -First of all, we put the address of `boot_stack_end` into the `eax` register, so the `eax` register contains the address of `boot_stack_end` as it was linked, which is `0x0 + boot_stack_end`. To get the real address of `boot_stack_end`, we need to add the real address of the `startup_32` function. We've already found this address and put it into the `ebp` register. In the end, the `eax` register will contain the real address of `boot_stack_end` and we just need to set the stack pointer to it. +### CPU verification -After we have set up the stack, the next step is CPU verification. Since we are transitioning to `long mode`, we need to check that the CPU supports `long mode` and `SSE`. We will do this with a call to the `verify_cpu` function: +Before we the kernel can switch to long mode, it needs to check that it runs on the suitable `x86_64` processor. This is done by the next piece of code: + ```assembly + /* Make sure cpu supports long mode. */ call verify_cpu testl %eax, %eax - jnz no_longmode + jnz .Lno_longmode ``` -This function is defined in the [arch/x86/kernel/verify_cpu.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/kernel/verify_cpu.S) assembly file and just contains a couple of calls to the [cpuid](https://en.wikipedia.org/wiki/CPUID) instruction. This instruction is used to get information about the processor. In our case, it checks for `long mode` and `SSE` support and sets the `eax` register to `0` on success and `1` on failure. - -If the value of `eax` is not zero, we jump to the `no_longmode` label which just stops the CPU with the `hlt` instruction while no hardware interrupt can happen: +The `verify_cpu` function defined in the [arch/x86/kernel/verify_cpu.S](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/verify_cpu.S) and executes the [cpuid](https://en.wikipedia.org/wiki/CPUID) instruction to check the details of the processors on which kernel is running on. In our case, the most crucial check is for `long mode` and [SSE](http://en.wikipedia.org/wiki/Streaming_SIMD_Extensions) support. and sets the `eax` register to `0` on success and `1` on failure. If the long mode is not supported by the current processor, the kernel jumps to the `no_longmode` label which just stops the CPU with the `hlt` instruction: + ```assembly -no_longmode: + .code32 +SYM_FUNC_START_LOCAL_NOALIGN(.Lno_longmode) + /* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */ 1: hlt jmp 1b ``` -If the value of the `eax` register is zero, everything is ok and we can continue. - -Calculate the relocation address --------------------------------------------------------------------------------- - -The next step is to calculate the relocation address for decompression if needed. First, we need to know what it means for a kernel to be `relocatable`. We already know that the base address of the 32-bit entry point of the Linux kernel is `0x100000`, but that is a 32-bit entry point. The default base address of the Linux kernel is determined by the value of the `CONFIG_PHYSICAL_START` kernel configuration option. Its default value is `0x1000000` or `16 MB`. The main problem here is that if the Linux kernel crashes, a kernel developer must have a `rescue kernel` for [kdump](https://www.kernel.org/doc/Documentation/kdump/kdump.txt) which is configured to load from a different address. The Linux kernel provides a special configuration option to solve this problem: `CONFIG_RELOCATABLE`. As we can read in the documentation of the Linux kernel: - -``` -This builds a kernel image that retains relocation information -so it can be loaded someplace besides the default 1MB. - -Note: If CONFIG_RELOCATABLE=y, then the kernel runs from the address -it has been loaded at and the compile time physical address -(CONFIG_PHYSICAL_START) is used as the minimum location. -``` - -Now that we know where to start, let's get to it. +If everything is ok, the kernel proceeds its work. -Reload the segments if needed --------------------------------------------------------------------------------- +### Calculation of the kernel relocation address -As indicated above, we start in the [arch/x86/boot/compressed/head_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/compressed/head_64.S) assembly source code file. We first see the definition of a special section attribute before the definition of the `startup_32` function: +The next step is to calculate the address for the kernel decompression. The kernel consists of two parts: -```assembly - __HEAD - .code32 -ENTRY(startup_32) -``` - -`__HEAD` is a macro defined in the [include/linux/init.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/init.h) header file and expands to the definition of the following section: - -```C -#define __HEAD .section ".head.text","ax" -``` - -Here, `.head.text` is the name of the section and `ax` is a set of flags. In our case, these flags show us that this section is [executable](https://en.wikipedia.org/wiki/Executable). In simple terms, this means that a Linux kernel with this option set can be booted from different addresses. Technically, this is done by compiling the decompressor as [position independent code](https://en.wikipedia.org/wiki/Position-independent_code). If we look at [arch/x86/boot/compressed/Makefile](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/Makefile), we can see that the decompressor is indeed compiled with the `-fPIC` flag: - -```Makefile -KBUILD_CFLAGS += -fno-strict-aliasing -fPIC -``` +- Relatively small decompressor code +- Chunk of compressed kernel code -When we are using position-independent code an address is obtained by adding the address field of the instruction to the value of the program counter. We can load code which uses such addressing from any address. That's why we had to get the real physical address of `startup_32`. Now let's get back to the Linux kernel code. Our current goal is to calculate an address where we can relocate the kernel for decompression. The calculation of this address depends on the `CONFIG_RELOCATABLE` kernel configuration option. Let's look at the code: +Obviously, the final decompressed kernel code will be bigger than compressed image. The memory area where the decompressed kernel should locate may overlap with the area where the compressed image is located. In this case, the compressed image could be overwritten during decompression process. To avoid this, the the kernel will copy the compressed part for safe decompression. This is done by the following code: + ```assembly #ifdef CONFIG_RELOCATABLE movl %ebp, %ebx @@ -353,212 +346,140 @@ When we are using position-independent code an address is obtained by adding the notl %eax andl %eax, %ebx cmpl $LOAD_PHYSICAL_ADDR, %ebx - jge 1f + jae 1f #endif movl $LOAD_PHYSICAL_ADDR, %ebx -``` - -Remember that the value of the `ebp` register is the physical address of the `startup_32` label. If the `CONFIG_RELOCATABLE` kernel configuration option is enabled during kernel configuration, we put this address in the `ebx` register, align it to a multiple of `2MB` and compare it with the result of the `LOAD_PHYSICAL_ADDR` macro. `LOAD_PHYSICAL_ADDR` is defined in the [arch/x86/include/asm/boot.h](https://github.com/torvalds/linux/blob/v4.16/arch/x86/include/asm/boot.h) header file and it looks like this: - -```C -#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ - + (CONFIG_PHYSICAL_ALIGN - 1)) \ - & ~(CONFIG_PHYSICAL_ALIGN - 1)) -``` - -As we can see it just expands to the aligned `CONFIG_PHYSICAL_ALIGN` value which represents the physical address where the kernel will be loaded. After comparing `LOAD_PHYSICAL_ADDR` and the value of the `ebx` register, we add the offset from `startup_32` where we will decompress the compressed kernel image. If the `CONFIG_RELOCATABLE` option is not enabled during kernel configuration, we just add `z_extract_offset` to the default address where the kernel is loaded. - -After all of these calculations, `ebp` will contain the address where we loaded the kernel and `ebx` will contain the address where the decompressed kernel will be relocated. But that is not the end. The compressed kernel image should be moved to the end of the decompression buffer to simplify calculations regarding where the kernel will be located later. For this: - -```assembly 1: - movl BP_init_size(%esi), %eax - subl $_end, %eax - addl %eax, %ebx -``` - -we put the value from the `boot_params.BP_init_size` field (or the kernel setup header value from `hdr.init_size`) in the `eax` register. The `BP_init_size` field contains the larger of the compressed and uncompressed [vmlinux](https://en.wikipedia.org/wiki/Vmlinux) sizes. Next we subtract the address of the `_end` symbol from this value and add the result of the subtraction to the `ebx` register which will store the base address for kernel decompression. - -Preparation before entering long mode --------------------------------------------------------------------------------- - -After we get the address to relocate the compressed kernel image to, we need to do one last step before we can transition to 64-bit mode. First, we need to update the [Global Descriptor Table](https://en.wikipedia.org/wiki/Global_Descriptor_Table) with 64-bit segments because a relocatable kernel is runnable at any address below 512GB: - -```assembly - addl %ebp, gdt+2(%ebp) - lgdt gdt(%ebp) -``` -Here we adjust the base address of the Global Descriptor table to the address where we actually loaded the kernel and load the `Global Descriptor Table` with the `lgdt` instruction. - -To understand the magic with `gdt` offsets we need to look at the definition of the `Global Descriptor Table`. We can find its definition in the same source code [file](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/head_64.S): - -```assembly - .data -gdt64: - .word gdt_end - gdt - .long 0 - .word 0 - .quad 0 -gdt: - .word gdt_end - gdt - .long gdt - .word 0 - .quad 0x00cf9a000000ffff /* __KERNEL32_CS */ - .quad 0x00af9a000000ffff /* __KERNEL_CS */ - .quad 0x00cf92000000ffff /* __KERNEL_DS */ - .quad 0x0080890000000000 /* TS descriptor */ - .quad 0x0000000000000000 /* TS continued */ -gdt_end: + /* Target address to relocate to for decompression */ + addl BP_init_size(%esi), %ebx + subl $ rva(_end), %ebx ``` -We can see that it is located in the `.data` section and contains five descriptors: the first is a `32-bit` descriptor for the kernel code segment, a `64-bit` kernel segment, a kernel data segment and two task descriptors. +The `ebp` register contains current address of the beginning of the kernel image. We put this address to the `ebx` register and aligned it by the `2MB` border. If the resulted address equal or bigger than `LOAD_PHYSICAL_ADDRESS` which is `0x1000000` we use it as is, otherwise we set it to `0x1000000`. Since we have the beginning of the address where to move the compressed kernel image, we add to it `BP_init_size` which is the size of decompressed kernel image. This will allow us to copy compressed kernel image behind the memory area where the kernel will be decompressed. In the end we just subtract the address of the `_end` from the value in the `ebx` to get the new base address of the decompressor kernel code. -We already loaded the `Global Descriptor Table` in the previous [part](https://github.com/0xAX/linux-insides/blob/v4.16/Booting/linux-bootstrap-3.md), and now we're doing almost the same here, but we set descriptors to use `CS.L = 1` and `CS.D = 0` for execution in `64` bit mode. As we can see, the definition of the `gdt` starts with a two byte value: `gdt_end - gdt` which represents the address of the last byte in the `gdt` table or the table limit. The next four bytes contain the base address of the `gdt`. +### Enabling PAE mode -After we have loaded the `Global Descriptor Table` with the `lgdt` instruction, we must enable [PAE](http://en.wikipedia.org/wiki/Physical_Address_Extension) by putting the value of the `cr4` register into `eax`, setting the 5th bit and loading it back into `cr4`: +The next step is to setup so-called `PAE` mode: + ```assembly + /* Enable PAE mode */ movl %cr4, %eax orl $X86_CR4_PAE, %eax movl %eax, %cr4 ``` -Now we are almost finished with the preparations needed to move into 64-bit mode. The last step is to build page tables, but before that, here is some information about long mode. - -Long mode --------------------------------------------------------------------------------- - -[Long mode](https://en.wikipedia.org/wiki/Long_mode) is the native mode for [x86_64](https://en.wikipedia.org/wiki/X86-64) processors. First, let's look at some differences between `x86_64` and `x86`. - -`64-bit` mode provides the following features: - -* 8 new general purpose registers from `r8` to `r15` -* All general purpose registers are 64-bit now -* A 64-bit instruction pointer - `RIP` -* A new operating mode - Long mode; -* 64-Bit Addresses and Operands; -* RIP Relative Addressing (we will see an example of this in the coming parts). - -Long mode is an extension of the legacy protected mode. It consists of two sub-modes: +We doing it by setting the `X86_CR4_PAE` bit in the `cr4` [control register](https://en.wikipedia.org/wiki/Control_register). This tells to CPU that the page table entries that we will see soon will be enlarged from `32` to `64` bits. -* 64-bit mode; -* compatibility mode. +## Setup paging -To switch into `64-bit` mode we need to do the following things: +At this moment we are almost finished with the preparations needed to switch the processor into the 64-bit mode. One of the last step, is to build [page tables](https://en.wikipedia.org/wiki/Page_table). But before we will take a look at the process of page tables setup, let's try briefly understand what is it. -* Enable [PAE](https://en.wikipedia.org/wiki/Physical_Address_Extension); -* Build page tables and load the address of the top level page table into the `cr3` register; -* Enable `EFER.LME`; -* Enable paging. +As we mentioned in the beginning of this chapter - on `x86_64`, the processor must have paging enabled to use long mode. Paging lets the processor translate [virtual addresses](https://en.wikipedia.org/wiki/Virtual_address_space) or addresses used by the code, into a [physical addresses](https://en.wikipedia.org/wiki/Physical_address). The translation of virtual addresses into physical done using the special structure - page tables. All the memory considered as array of sequential blocks called pages. Each page is described by the special descriptor in the page table called `PTE` or page table entry. The page table entries are stored in the special structure called page tables. The page table is a structure with predefined hierarchy: -We already enabled `PAE` by setting the `PAE` bit in the `cr4` control register. Our next goal is to build the structure for [paging](https://en.wikipedia.org/wiki/Paging). We will discuss this in the next paragraph. +- `PML4` - top level table, each entry points to `PDPT` +- `PDPT` - 3rd level table, each entry points to `PD` +- `PD` - 2nd level table, each entry poitns to `PT` +- `PT` - 1st level table, each entry points to a 4 killobyte physical page -Early page table initialization --------------------------------------------------------------------------------- +The physical address of the top level table must be stored in the `cr3` register. -We already know that before we can move into `64-bit` mode, we need to build page tables. Let's look at how the early `4G` boot page tables are built. +When the processor needs to translate a virtual address into the corresponding physical address, it splits the virtual address to the next parts: -**NOTE: I will not describe the theory of virtual memory here. If you want to know more about virtual memory, check out the links at the end of this part.** +![early-page-table.svg](./images/early-page-table.svg) -The Linux kernel uses `4-level` paging, and we generally build 6 page tables: +Knowing the index of the corresponding entry in each table, CPU obtains the physical address. -* One `PML4` or `Page Map Level 4` table with one entry; -* One `PDP` or `Page Directory Pointer` table with four entries; -* Four Page Directory tables with a total of `2048` entries. - -Let's look at how this is implemented. First, we clear the buffer for the page tables in memory. Every table is `4096` bytes, so we need clear a `24` kilobyte buffer: +The next goal of the kernel is to build a structure similar to the description above to switch to long mode. Let's take a look how it is implemented in the kernel. First of all we need to fill the current page table structure specified by the [pgtable](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/head_64.S#L533) with zeros for safeness: + ```assembly - leal pgtable(%ebx), %edi + leal rva(pgtable)(%ebx), %edi xorl %eax, %eax movl $(BOOT_INIT_PGT_SIZE/4), %ecx rep stosl ``` -We put the address of `pgtable` with an offset of `ebx` (remember that `ebx` points to the location in memory where the kernel will be decompressed later) into the `edi` register, clear the `eax` register and set the `ecx` register to `6144`. - -The `rep stosl` instruction will write the value of `eax` to the memory location where `edi` points to, increment `edi` by `4`, and decrement `ecx` by `1`. This operation will be repeated while the value of the `ecx` register is greater than zero. That's why we put `6144` or `BOOT_INIT_PGT_SIZE/4` in `ecx`. - -`pgtable` is defined at the end of the [arch/x86/boot/compressed/head_64.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/head_64.S) assembly file: +After we cleaned the memory area for the page tables, we can start to fill it. First of all, we need to fill the top-level page entry: + ```assembly - .section ".pgtable","a",@nobits - .balign 4096 -pgtable: - .fill BOOT_PGT_SIZE, 1, 0 + leal rva(pgtable + 0)(%ebx), %edi + leal 0x1007 (%edi), %eax + movl %eax, 0(%edi) + addl %edx, 4(%edi) ``` -As we can see, it is located in the `.pgtable` section and its size depends on the `CONFIG_X86_VERBOSE_BOOTUP` kernel configuration option: +This adds the first entry to the top-level page table. This entry will contain a reference to the first entry of the lower-level table. The offset to it is `0x1000` bytes. The `0x7` are flags of the page table entry: -```C -# ifdef CONFIG_X86_VERBOSE_BOOTUP -# define BOOT_PGT_SIZE (19*4096) -# else /* !CONFIG_X86_VERBOSE_BOOTUP */ -# define BOOT_PGT_SIZE (17*4096) -# endif -# else /* !CONFIG_RANDOMIZE_BASE */ -# define BOOT_PGT_SIZE BOOT_INIT_PGT_SIZE -# endif -``` +- Present +- Read/Write +- User -After we have a buffer for the `pgtable` structure, we can start to build the top level page table - `PML4` - with: +Each page entry is `64-bit` structure, no matter if it is a `PML4`, `PDPT`, `PD` or `PT` entry. The format is almost the same among all the levels. The difference is only in the address field which stores the physical address of the next page table by hierarchy. Besides the address field, a page table entry contains flags like: -```assembly - leal pgtable + 0(%ebx), %edi - leal 0x1007 (%edi), %eax - movl %eax, 0(%edi) -``` +- `P` - present bit +- `RW` - read/write bit +- `US` - user/supervisor bit +- `PWT` - Page-level Write-Through bit controlling caching of the page +- `PCD` - Page Cache Disable bit controlling caching of the page +- `A` - accessed page bit +- `D` - dirty page bit +- `PS` - page size bit +- `NX` - No-Execute bit -Here again, we put the address of `pgtable` relative to `ebx` or in other words relative to address of `startup_32` in the `edi` register. Next, we put this address with an offset of `0x1007` into the `eax` register. `0x1007` is the result of adding the size of the `PML4` table which is `4096` or `0x1000` bytes with `7`. The `7` here represents the flags associated with the `PML4` entry. In our case, these flags are `PRESENT+RW+USER`. In the end, we just write the address of the first `PDP` entry to the `PML4` table. +More information about the page tables and page table entries structure you can find in the [Intel® 64 and IA-32 Architectures Software Developer Manuals](https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html). -In the next step we will build four `Page Directory` entries in the `Page Directory Pointer` table with the same `PRESENT+RW+USE` flags: +In the next step we will build four `Page Directory` entries in the `Page Directory Pointer` table with the same `Present+Read/Write/User` flags: + ```assembly - leal pgtable + 0x1000(%ebx), %edi + leal rva(pgtable + 0x1000)(%ebx), %edi leal 0x1007(%edi), %eax movl $4, %ecx -1: movl %eax, 0x00(%edi) +1: movl %eax, 0x00(%edi) + addl %edx, 0x04(%edi) addl $0x00001000, %eax addl $8, %edi decl %ecx jnz 1b ``` -We set `edi` to the base address of the page directory pointer which is at an offset of `4096` or `0x1000` bytes from the `pgtable` table and `eax` to the address of the first page directory pointer entry. We also set `ecx` to `4` to act as a counter in the following loop and write the address of the first page directory pointer table entry to the `edi` register. After this, `edi` will contain the address of the first page directory pointer entry with flags `0x7`. Next we calculate the address of the following page directory pointer entries — each entry is `8` bytes — and write their addresses to `eax`. The last step in building the paging structure is to build the `2048` page table entries with `2-MByte` pages: +In the code above, we may see filling of the first four entries of the 3rd level page table. The first entry is located at the offset `0x1000` from the beginning of the page table. The value of the `eax` register is similar to the 4th level page table entry. Next we just fill the four entries of this table in the "loop" while value of the `ecx` will not be zero. As soon as this table entries are filled, the next turn of the next level page table: + ```assembly - leal pgtable + 0x2000(%ebx), %edi + leal rva(pgtable + 0x2000)(%ebx), %edi movl $0x00000183, %eax movl $2048, %ecx -1: movl %eax, 0(%edi) +1: movl %eax, 0(%edi) + addl %edx, 4(%edi) addl $0x00200000, %eax addl $8, %edi decl %ecx jnz 1b ``` -Here we do almost the same things that we did in the previous example, all entries are associated with these flags - `$0x00000183` - `PRESENT + WRITE + MBZ`. In the end, we will have a page table with `2048` `2-MByte` pages, which represents a 4 Gigabyte block of memory: +Here we already fill 4 page directories with 2048 entries. The first entry is located at the offset `0x2000` from the beginning of the page table. Each entry maps a 2 megabytes chunk of memory with the same `Present/Read/Write/Large Page` flags but in addition there is `Global` flag. This additional flag tells the processor to keep [TLB](https://en.wikipedia.org/wiki/Translation_lookaside_buffer) entry across reload of the value of the `cr3` register. -```python ->>> 2048 * 0x00200000 -4294967296 -``` +This was the last page table entries which kernel fills. There is no need for this moment to fill the 4th level `PT` tables because every at the 2nd level page table was filled with the `Large Page` bit, so each such entry directly maps a 2 megabytes region. During the address transition, the page-walk procedure stops at the `PD` level going through `PML4 → PDPT → PD`, and the lower `21` bits of the virtual address will be used as the offset inside that 2 megabytes page. -Since we've just finished building our early page table structure which maps `4` gigabytes of memory, we can put the address of the high-level page table - `PML4` - into the `cr3` control register: +Now we can enable the paging by storing the address of the page table in the `cr3` register: + ```assembly - leal pgtable(%ebx), %eax + leal rva(pgtable)(%ebx), %eax movl %eax, %cr3 ``` -That's all. We are now prepared to transition to long mode. +The page tables is ready and paging is enabled starting from this moment. Now the kernel is prepared for transition into the long mode. -The transition to 64-bit mode --------------------------------------------------------------------------------- +## The transition into 64-bit mode -First of all we need to set the `EFER.LME` flag in the [MSR](http://en.wikipedia.org/wiki/Model-specific_register) to `0xC0000080`: +Only the last steps are remaining before the Linux kernel can switch CPU into the long mode. The first one is setting the `EFER.LME` flag in the special [model specific register](http://en.wikipedia.org/wiki/Model-specific_register) to the predefined value `0xC0000080`: + ```assembly movl $MSR_EFER, %ecx rdmsr @@ -566,64 +487,52 @@ First of all we need to set the `EFER.LME` flag in the [MSR](http://en.wikipedia wrmsr ``` -Here we put the `MSR_EFER` flag (which is defined in [arch/x86/include/asm/msr-index.h](https://github.com/torvalds/linux/blob/v4.16/arch/x86/include/asm/msr-index.h)) in the `ecx` register and execute the `rdmsr` instruction which reads the [MSR](http://en.wikipedia.org/wiki/Model-specific_register) register. After `rdmsr` executes, the resulting data is stored in `edx:eax` according to the `MSR` register specified in `ecx`. We check the current `EFER_LME` bit, transfer it into the carry flag and update the bit, all with the `btsl` instruction. Then we write data from `edx:eax` back to the `MSR` register with the `wrmsr` instruction. +This is the `Long Mode Enable` bit and it is mandatory action to set this bit to enable `64-bit` mode. -In the next step, we push the address of the kernel segment code to the stack (we defined it in the GDT) and put the address of the `startup_64` routine in `eax`. +In the next step, we may see the preparation of the jump on the long mode entrypoint. To do this jump, the kernel stores the base address of the kernel segment code along with the address of the long mode entrypoint on the stack: + ```assembly + leal rva(startup_64)(%ebp), %eax pushl $__KERNEL_CS - leal startup_64(%ebp), %eax -``` - -After this we push `eax` to the stack and enable paging by setting the `PG` and `PE` bits in the `cr0` register: - -```assembly pushl %eax - movl $(X86_CR0_PG | X86_CR0_PE), %eax - movl %eax, %cr0 ``` -We then execute the `lret` instruction: +Everything is ready. Since our stack contains the base of the kernel code segment and the address of the entrypoint, kernel executes the last instruction in protected mode: + ```assembly -lret + lret ``` -Remember that we pushed the address of the `startup_64` function to the stack in the previous step. The CPU extracts `startup_64`'s address from the stack and jumps there. - -After all of these steps we're finally in 64-bit mode: +The CPU extracts the address of the `startup_64` from the stack and jumps there: + ```assembly .code64 .org 0x200 -ENTRY(startup_64) -.... -.... -.... +SYM_CODE_START(startup_64) ``` -That's all! +The Linux kernel now in 64-bit mode 🎉 -Conclusion --------------------------------------------------------------------------------- -This is the end of the fourth part of the linux kernel booting process. If you have any questions or suggestions, ping me on twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com) or just create an [issue](https://github.com/0xAX/linux-insides/issues/new). +## Conclusion -In the next part, we will learn about many things, including how kernel decompression works. +This is the end of the third part about Linux kernel insides. If you have questions or suggestions, feel free ping me on X - [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com), or just create an [issue](https://github.com/0xAX/linux-insides/issues/new). -**Please note that English is not my first language and I am really sorry for any inconvenience. If you find any mistakes please send a PR to [linux-insides](https://github.com/0xAX/linux-internals).** +## Links -Links --------------------------------------------------------------------------------- +Here is the list of the links that you may find useful during reading of this chapter: -* [Protected mode](http://en.wikipedia.org/wiki/Protected_mode) -* [Intel® 64 and IA-32 Architectures Software Developer’s Manual 3A](http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html) -* [GNU linker](http://www.eecs.umich.edu/courses/eecs373/readings/Linker.pdf) -* [SSE](http://en.wikipedia.org/wiki/Streaming_SIMD_Extensions) -* [Paging](http://en.wikipedia.org/wiki/Paging) -* [Model specific register](http://en.wikipedia.org/wiki/Model-specific_register) -* [.fill instruction](http://www.chemie.fu-berlin.de/chemnet/use/info/gas/gas_7.html) -* [Previous part](https://github.com/0xAX/linux-insides/blob/v4.16/Booting/linux-bootstrap-3.md) -* [Paging on osdev.org](http://wiki.osdev.org/Paging) -* [Paging Systems](https://www.cs.rutgers.edu/~pxk/416/notes/09a-paging.html) -* [x86 Paging Tutorial](http://www.cirosantilli.com/x86-paging/) +- [Real mode](https://en.wikipedia.org/wiki/Real_mode) +- [Protected mode](http://en.wikipedia.org/wiki/Protected_mode) +- [Long mode](https://en.wikipedia.org/wiki/Long_mode) +- [Linux kernel x86 boot protocol](https://www.kernel.org/doc/Documentation/x86/boot.txt) +- [Intel® 64 and IA-32 Architectures Software Developer Manuals](https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html) +- [Paging](http://en.wikipedia.org/wiki/Paging) +- [Virtual addresses](https://en.wikipedia.org/wiki/Virtual_address_space) +- [Physical addresses](https://en.wikipedia.org/wiki/Physical_address) +- [Model specific registers](http://en.wikipedia.org/wiki/Model-specific_register) +- [Control registers](https://en.wikipedia.org/wiki/Control_register) +- [Previous part](https://github.com/0xAX/linux-insides/blob/v4.16/Booting/linux-bootstrap-3.md) diff --git a/Booting/linux-bootstrap-5.md b/Booting/linux-bootstrap-5.md index a72b7f5a..94124f36 100644 --- a/Booting/linux-bootstrap-5.md +++ b/Booting/linux-bootstrap-5.md @@ -204,7 +204,7 @@ Like before, we push `rsi` onto the stack to preserve the pointer to `boot_param * `output` - the start address of the decompressed kernel; * `output_len` - the size of the decompressed kernel; -All arguments will be passed through registers as per the [System V Application Binary Interface](http://www.x86-64.org/documentation/abi.pdf). We've finished all the preparations and can now decompress the kernel. +All arguments will be passed through registers as per the [System V Application Binary Interface](https://github.com/hjl-tools/x86-psABI/wiki/x86-64-psABI-1.0.pdf). We've finished all the preparations and can now decompress the kernel. Kernel decompression -------------------------------------------------------------------------------- @@ -383,13 +383,13 @@ That's all. Now we are in the kernel! Conclusion -------------------------------------------------------------------------------- -This is the end of the fifth part about the linux kernel booting process. We will not see any more posts about the kernel booting process (there may be updates to this and previous posts though), but there will be many posts about other kernel internals. +This is the end of the fifth part about the Linux kernel booting process. We will not see any more posts about the kernel booting process (there may be updates to this and previous posts though), but there will be many posts about other kernel internals. -The Next chapter will describe more advanced details about linux kernel booting process, like load address randomization and etc. +The Next chapter will describe more advanced details about Linux kernel booting process, like load address randomization and etc. If you have any questions or suggestions write me a comment or ping me in [twitter](https://twitter.com/0xAX). -**Please note that English is not my first language, And I am really sorry for any inconvenience. If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-internals).** +**Please note that English is not my first language, And I am really sorry for any inconvenience. If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).** Links -------------------------------------------------------------------------------- diff --git a/Booting/linux-bootstrap-6.md b/Booting/linux-bootstrap-6.md index 6bea71d7..f6f8f3d2 100644 --- a/Booting/linux-bootstrap-6.md +++ b/Booting/linux-bootstrap-6.md @@ -46,7 +46,7 @@ This function takes five parameters: * `input`; * `input_size`; * `output`; - * `output_isze`; + * `output_size`; * `virt_addr`. Let's try to understand what these parameters are. The first parameter, `input` is just the `input_data` parameter of the `extract_kernel` function from the [arch/x86/boot/compressed/misc.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/misc.c) source code file, cast to `unsigned long`: @@ -91,7 +91,7 @@ input_data: input_data_end: ``` -As you can see, it contains four global symbols. The first two, `z_input_len` and `z_output_len` are the sizes of the compressed and uncompressed `vmlinux.bin.gz` archive. The third is our `input_data` parameter which points to the linux kernel image's raw binary (stripped of all debugging symbols, comments and relocation information). The last parameter, `input_data_end`, points to the end of the compressed linux image. +As you can see, it contains four global symbols. The first two, `z_input_len` and `z_output_len` are the sizes of the compressed and uncompressed `vmlinux.bin.gz` archive. The third is our `input_data` parameter which points to the Linux kernel image's raw binary (stripped of all debugging symbols, comments and relocation information). The last parameter, `input_data_end`, points to the end of the compressed linux image. So, the first parameter to the `choose_random_location` function is the pointer to the compressed kernel image that is embedded into the `piggy.o` object file. @@ -146,7 +146,7 @@ Now, we call another function: initialize_identity_maps(); ``` -The `initialize_identity_maps` function is defined in the [arch/x86/boot/compressed/kaslr_64.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/kaslr_64.c) source code file. This function starts by initialising an instance of the `x86_mapping_info` structure called `mapping_info`: +The `initialize_identity_maps` function is defined in the [arch/x86/boot/compressed/kaslr_64.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/kaslr_64.c) source code file. This function starts by initializing an instance of the `x86_mapping_info` structure called `mapping_info`: ```C mapping_info.alloc_pgt_page = alloc_pgt_page; @@ -254,7 +254,7 @@ add_identity_map(mem_avoid[MEM_AVOID_ZO_RANGE].start, mem_avoid[MEM_AVOID_ZO_RANGE].size); ``` -THe `mem_avoid_init` function first tries to avoid memory regions currently used to decompress the kernel. We fill an entry from the `mem_avoid` array with the start address and the size of the relevant region and call the `add_identity_map` function, which builds the identity mapped pages for this region. The `add_identity_map` function is defined in the [arch/x86/boot/compressed/kaslr_64.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/kaslr_64.c) source code file and looks like this: +The `mem_avoid_init` function first tries to avoid memory regions currently used to decompress the kernel. We fill an entry from the `mem_avoid` array with the start address and the size of the relevant region and call the `add_identity_map` function, which builds the identity mapped pages for this region. The `add_identity_map` function is defined in the [arch/x86/boot/compressed/kaslr_64.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/kaslr_64.c) source code file and looks like this: ```C void add_identity_map(unsigned long start, unsigned long size) @@ -395,13 +395,13 @@ That's all. Conclusion -------------------------------------------------------------------------------- -This is the end of the sixth and last part concerning the linux kernel's booting process. We will not see any more posts about kernel booting (though there may be updates to this and previous posts). We will now turn to other parts of the linux kernel instead. +This is the end of the sixth and last part concerning the Linux kernel's booting process. We will not see any more posts about kernel booting (though there may be updates to this and previous posts). We will now turn to other parts of the linux kernel instead. The next chapter will be about kernel initialization and we will study the first steps take in the Linux kernel initialization code. If you have any questions or suggestions write me a comment or ping me in [twitter](https://twitter.com/0xAX). -**Please note that English is not my first language, And I am really sorry for any inconvenience. If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-internals).** +**Please note that English is not my first language, And I am really sorry for any inconvenience. If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).** Links -------------------------------------------------------------------------------- diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 00000000..9b2e7c87 --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,5 @@ +# Owner of the repository +* @0xAX + +# Documentation owners +*.md @0xAX @klaudiagrz diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..b160fe62 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity +and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or + advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email + address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +kuleshovmail@gmail.com. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series +of actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or +permanent ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within +the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.0, available at +https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. + +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder](https://github.com/mozilla/diversity). + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see the FAQ at +https://www.contributor-covenant.org/faq. Translations are available at +https://www.contributor-covenant.org/translations. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9caf3295..8dc4752d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,33 +1,45 @@ -Contributing -================================================================================ +# Contributing -If you want to contribute to [linux-insides](https://github.com/0xAX/linux-insides), please follow these simple rules: +This document outlines the contribution workflow, starting from opening an issue, creating a pull request (PR), reviewing, and merging the PR. When working on this project, make sure to follow the [Code of Conduct](./CODE_OF_CONDUCT.md). -1. Press the fork button: +Thank you for your contribution. - ![fork](https://docs.github.com/assets/images/help/repository/fork_button.jpg) +## New contributor guide -2. Clone the repository from your account with: +If you are a new open source contributor, here are some resources you may find useful before providing your first contributions: - ``` - git clone git@github.com:your_github_username/linux-insides.git - ``` +- [Finding ways to contribute to open source on GitHub](https://docs.github.com/en/get-started/exploring-projects-on-github/finding-ways-to-contribute-to-open-source-on-github) +- [Set up Git](https://docs.github.com/en/get-started/getting-started-with-git/set-up-git) +- [GitHub flow](https://docs.github.com/en/get-started/using-github/github-flow) +- [Collaborating with pull requests](https://docs.github.com/en/github/collaborating-with-pull-requests) -3. Create a new branch with: +**Working on your first pull request?** You can learn how from this free series [How to Contribute to an Open Source Project on GitHub](https://kcd.im/pull-request). - ``` - git checkout -b "linux-bootstrap-1-fix" - ``` - You can name it however you want. +## Create an issue -4. Make your changes. +If you have any improvement ideas, notice a missing feature or a bug, create a GitHub issue by clicking **Issues -> New issue** in GitHub. Make sure to fill the issue template with a detailed description of the bug or suggested improvements. Provide proper argumentation and screenshots, if necessary. -5. Don't forget to add yourself in `contributors.md`. +If you find any existing issue to work on, you are welcome to open a PR with a fix. -6. Commit and push your changes, then make a pull request from Github. +## Open a pull request -**IMPORTANT** +If you want to directly contribute to the project, create a pull reguest with the suggested changes. To do so: -Please, don't forget to update your fork. While you made your changes, the content of the `master` branch can change because other pull requests were merged and it can create conflicts. This is why you have to rebase on `master` every time before pushing your changes and check that your branch doesn't have any conflicts with `master`. +1. [Fork the repository](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo#fork-an-example-repository). -Thank you. +2. Make changes on your local copy of the forked repository. + +3. Commit and push the changes to GitHub. + +> [!IMPORTANT] +> Don't forget to update your fork. Since many contributors may be working on the same content based on the `master` branch, some merge conflicts may occur. Remember to rebase with `master` every time before pushing your changes and make sure your branch doesn't have any conflicts with `master`. If you run into any merge conflicts, read the [Resolve merge conflicts](https://github.com/skills/resolve-merge-conflicts) tutorial to learn how to resolve merge conflicts and other issues. + +4. Open a pull request in GitHub. Fill the pull request template with the reason and description for the provided changes. Link your pull request with the existing issue, if applicable. After submitting your PR, wait for the review from the project maintainers. + +## Review and approval process + +After you submit your PR, wait for the review. The project maintainers will evaluate your changes and provide feedback either using [suggested changes](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/incorporating-feedback-in-your-pull-request) or pull request comments. Address the review suggestions and comments as soon as you can. If your PR looks good, the maintainers approve and merge it. + +## Contributors + +All contributions get credit in [Contributors](contributors.md). Don't forget to add yourself there. diff --git a/Cgroups/images/menuconfig.png b/Cgroups/images/menuconfig.png index 8e2324a1..13f1d5d5 100644 Binary files a/Cgroups/images/menuconfig.png and b/Cgroups/images/menuconfig.png differ diff --git a/Cgroups/linux-cgroups-1.md b/Cgroups/linux-cgroups-1.md index 2aa1167b..79319ad0 100644 --- a/Cgroups/linux-cgroups-1.md +++ b/Cgroups/linux-cgroups-1.md @@ -30,7 +30,7 @@ Each of these control group subsystems depends on related configuration option. You may see enabled control groups on your computer via [proc](https://en.wikipedia.org/wiki/Procfs) filesystem: ``` -$ cat /proc/cgroups +$ cat /proc/cgroups #subsys_name hierarchy num_cgroups enabled cpuset 8 1 1 cpu 7 66 1 @@ -90,7 +90,7 @@ So, if we will run this script we will see following result: ``` $ sudo chmod +x cgroup_test_script.sh -~$ ./cgroup_test_script.sh +~$ ./cgroup_test_script.sh print line print line print line @@ -147,7 +147,7 @@ crw-rw-rw- 1 root tty 5, 0 Dec 3 22:48 /dev/tty see the first `c` letter in a permissions list. The second part is `5:0` is major and minor numbers of the device. You can see these numbers in the output of `ls` too. And the last `w` letter forbids tasks to write to the specified device. So let's start the `cgroup_test_script.sh` script: ``` -~$ ./cgroup_test_script.sh +~$ ./cgroup_test_script.sh print line print line print line @@ -164,7 +164,7 @@ and add pid of this process to the `devices/tasks` file of our group: The result of this action will be as expected: ``` -~$ ./cgroup_test_script.sh +~$ ./cgroup_test_script.sh print line print line print line @@ -174,7 +174,7 @@ print line ./cgroup_test_script.sh: line 5: /dev/tty: Operation not permitted ``` -Similar situation will be when you will run you [docker](https://en.wikipedia.org/wiki/Docker_\(software\)) containers for example: +Similar situation will be when you will run you [docker](https://en.wikipedia.org/wiki/Docker_(software)) containers for example: ``` ~$ docker ps @@ -213,7 +213,7 @@ Control group /: │ └─6404 /bin/bash ``` -Now we know a little about `control groups` mechanism, how to use it manually and what's purpose of this mechanism. It's time to look inside of the Linux kernel source code and start to dive into implementation of this mechanism. +Now we know a little about `control groups` mechanism, how to use it manually and what's the purpose of this mechanism. It's time to look inside of the Linux kernel source code and start to dive into implementation of this mechanism. Early initialization of control groups -------------------------------------------------------------------------------- @@ -294,7 +294,7 @@ Here we may see call of the `init_cgroup_root` function which will execute initi struct cgroup_root cgrp_dfl_root; ``` -Its `cgrp` field represented by the `cgroup` structure which represents a `cgroup` as you already may guess and defined in the [include/linux/cgroup-defs.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/cgroup-defs.h) header file. We already know that a process which is represented by the `task_struct` in the Linux kernel. The `task_struct` does not contain direct link to a `cgroup` where this task is attached. But it may be reached via `css_set` field of the `task_struct`. This `css_set` structure holds pointer to the array of subsystem states: +Its `cgrp` field represented by the `cgroup` structure which represents a `cgroup` as you already may guess and defined in the [include/linux/cgroup-defs.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/cgroup-defs.h) header file. We already know that a process is represented by the `task_struct` in the Linux kernel. The `task_struct` does not contain direct link to a `cgroup` where this task is attached. But it may be reached via `css_set` field of the `task_struct`. This `css_set` structure holds pointer to the array of subsystem states: ```C struct css_set { @@ -324,14 +324,14 @@ struct cgroup_subsys_state { So, the overall picture of `cgroups` related data structure is following: -``` +``` +-------------+ +---------------------+ +------------->+---------------------+ +----------------+ | task_struct | | css_set | | | cgroup_subsys_state | | cgroup | +-------------+ | | | +---------------------+ +----------------+ | | | | | | | | flags | | | | | | +---------------------+ | cgroup.procs | | | | | | | cgroup |--------->| id | -| | | | | +---------------------+ | .... | +| | | | | +---------------------+ | .... | |-------------+ |---------------------+----+ +----------------+ | cgroups | ------> | cgroup_subsys_state | array of cgroup_subsys_state |-------------+ +---------------------+------------------>+---------------------+ +----------------+ diff --git a/Concepts/linux-cpu-2.md b/Concepts/linux-cpu-2.md index 2ccc951b..59c00568 100644 --- a/Concepts/linux-cpu-2.md +++ b/Concepts/linux-cpu-2.md @@ -19,13 +19,13 @@ set_cpu_present(cpu, true); set_cpu_possible(cpu, true); ``` -Before we will consider implementation of these functions, let's consider all of these masks. +Before we consider implementation of these functions, let's consider all of these masks. -The `cpu_possible` is a set of cpu ID's which can be plugged in anytime during the life of that system boot or in other words mask of possible CPUs contains maximum number of CPUs which are possible in the system. It will be equal to value of the `NR_CPUS` which is which is set statically via the `CONFIG_NR_CPUS` kernel configuration option. +The `cpu_possible` is a set of cpu ID's which can be plugged in anytime during the life of that system boot or in other words mask of possible CPUs contains maximum number of CPUs which are possible in the system. It will be equal to value of the `NR_CPUS` which is set statically via the `CONFIG_NR_CPUS` kernel configuration option. The `cpu_present` mask represents which CPUs are currently plugged in. -The `cpu_online` represents a subset of the `cpu_present` and indicates CPUs which are available for scheduling or in other words a bit from this mask tells to kernel is a processor may be utilized by the Linux kernel. +The `cpu_online` represents a subset of the `cpu_present` and indicates CPUs which are available for scheduling or in other words a bit from this mask tells the kernel if a processor may be utilized by the Linux kernel. The last mask is `cpu_active`. Bits of this mask tells to Linux kernel is a task may be moved to a certain processor. @@ -94,9 +94,9 @@ And returns `1` every time. We need it here for only one purpose: at compile tim cpumask API -------------------------------------------------------------------------------- -As we can define cpumask with one of the method, Linux kernel provides API for manipulating a cpumask. Let's consider one of the function which presented above. For example `set_cpu_online`. This function takes two parameters: +As we can define cpumask with one of the methods, Linux kernel provides API for manipulating a cpumask. Let's consider one of the function which presented above. For example `set_cpu_online`. This function takes two parameters: -* Number of CPU; +* Index of CPU; * CPU status; Implementation of this function looks as: @@ -113,7 +113,7 @@ void set_cpu_online(unsigned int cpu, bool online) } ``` -First of all it checks the second `state` parameter and calls `cpumask_set_cpu` or `cpumask_clear_cpu` depends on it. Here we can see casting to the `struct cpumask *` of the second parameter in the `cpumask_set_cpu`. In our case it is `cpu_online_bits` which is a bitmap and defined as: +First of all it checks the second `state` parameter and calls `cpumask_set_cpu` or `cpumask_clear_cpu` depending on it. Here we can see casting to the `struct cpumask *` of the second parameter in the `cpumask_set_cpu`. In our case it is `cpu_online_bits` which is a bitmap and defined as: ```C static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly; @@ -128,7 +128,7 @@ static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) } ``` -The `set_bit` function takes two parameters too, and sets a given bit (first parameter) in the memory (second parameter or `cpu_online_bits` bitmap). We can see here that before `set_bit` will be called, its two parameters will be passed to the +The `set_bit` function takes two parameters too, and sets a given bit (first parameter) in the memory (second parameter or `cpu_online_bits` bitmap). We can see here that before `set_bit` is called, its two parameters will be passed to the * cpumask_check; * cpumask_bits. diff --git a/Concepts/linux-cpu-4.md b/Concepts/linux-cpu-4.md index 45dfe62e..b7e7e588 100644 --- a/Concepts/linux-cpu-4.md +++ b/Concepts/linux-cpu-4.md @@ -6,12 +6,12 @@ Introduction The Linux kernel is huge piece of [C](https://en.wikipedia.org/wiki/C_%28programming_language%29) code which consists from many different subsystems. Each subsystem has its own purpose which is independent of other subsystems. But often one subsystem wants to know something from other subsystem(s). There is special mechanism in the Linux kernel which allows to solve this problem partly. The name of this mechanism is - `notification chains` and its main purpose to provide a way for different subsystems to subscribe on asynchronous events from other subsystems. Note that this mechanism is only for communication inside kernel, but there are other mechanisms for communication between kernel and userspace. -Before we will consider `notification chains` [API](https://en.wikipedia.org/wiki/Application_programming_interface) and implementation of this API, let's look at `Notification chains` mechanism from theoretical side as we did it in other parts of this book. Everything which is related to `notification chains` mechanism is located in the [include/linux/notifier.h](https://github.com/torvalds/linux/blob/master/include/linux/notifier.h) header file and [kernel/notifier.c](https://github.com/torvalds/linux/blob/master/kernel/notifier.c) source code file. So let's open them and start to dive. +Before we consider `notification chains` [API](https://en.wikipedia.org/wiki/Application_programming_interface) and implementation of this API, let's look at `Notification chains` mechanism from theoretical side as we did it in other parts of this book. Everything which is related to `notification chains` mechanism is located in the [include/linux/notifier.h](https://github.com/torvalds/linux/blob/master/include/linux/notifier.h) header file and [kernel/notifier.c](https://github.com/torvalds/linux/blob/master/kernel/notifier.c) source code file. So let's open them and start to dive. Notification Chains related data structures -------------------------------------------------------------------------------- -Let's start to consider `notification chains` mechanism from related data structures. As I wrote above, main data structures should be located in the [include/linux/notifier.h](https://github.com/torvalds/linux/blob/master/include/linux/notifier.h) header file, so the Linux kernel provides generic API which does not depend on certain architecture. In general, the `notification chains` mechanism represents a list (that's why it named `chains`) of [callback](https://en.wikipedia.org/wiki/Callback_%28computer_programming%29) functions which are will be executed when an event will be occurred. +Let's start to consider `notification chains` mechanism from related data structures. As I wrote above, main data structures should be located in the [include/linux/notifier.h](https://github.com/torvalds/linux/blob/master/include/linux/notifier.h) header file, so the Linux kernel provides generic API which does not depend on certain architecture. In general, the `notification chains` mechanism represents a list (that's why it's named `chains`) of [callback](https://en.wikipedia.org/wiki/Callback_%28computer_programming%29) functions which are will be executed when an event will be occurred. All of these callback functions are represented as `notifier_fn_t` type in the Linux kernel: @@ -101,7 +101,7 @@ Now as we know a little about `notification chains` mechanism let's consider imp Notification Chains -------------------------------------------------------------------------------- -Usually there are two sides in a publish/subscriber mechanisms. One side who wants to get notifications and other side(s) who generates these notifications. We will consider notification chains mechanism from both sides. We will consider `blocking notification chains` in this part, because of other types of notification chains are similar to it and differs mostly in protection mechanisms. +Usually there are two sides in a publish/subscriber mechanisms. One side who wants to get notifications and other side(s) who generates these notifications. We will consider notification chains mechanism from both sides. We will consider `blocking notification chains` in this part, because of other types of notification chains are similar to it and differ mostly in protection mechanisms. Before a notification producer is able to produce notification, first of all it should initialize head of a notification chain. For example let's consider notification chains related to kernel [loadable modules](https://en.wikipedia.org/wiki/Loadable_kernel_module). If we will look in the [kernel/module.c](https://github.com/torvalds/linux/blob/master/kernel/module.c) source code file, we will see following definition: @@ -120,7 +120,7 @@ which defines head for loadable modules blocking notifier chain. The `BLOCKING_N So we may see that it takes name of a name of a head of a blocking notifier chain and initializes read/write [semaphore](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-3) and set head to `NULL`. Besides the `BLOCKING_INIT_NOTIFIER_HEAD` macro, the Linux kernel additionally provides `ATOMIC_INIT_NOTIFIER_HEAD`, `RAW_INIT_NOTIFIER_HEAD` macros and `srcu_init_notifier` function for initialization atomic and other types of notification chains. -After initialization of a head of a notification chain, a subsystem which wants to receive notification from the given notification chain it should register with certain function which is depends on type of notification. If you will look in the [include/linux/notifier.h](https://github.com/torvalds/linux/blob/master/include/linux/notifier.h) header file, you will see following four function for this: +After initialization of a head of a notification chain, a subsystem which wants to receive notification from the given notification chain should register with certain function which depends on the type of notification. If you will look in the [include/linux/notifier.h](https://github.com/torvalds/linux/blob/master/include/linux/notifier.h) header file, you will see following four function for this: ```C extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh, @@ -247,7 +247,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, } ``` -Where `nr_to_call` and `nr_calls` are number of notifier functions to be called and number of sent notifications. As you may guess the main goal of the `__blocking_notifer_call_chain` function and other functions for other notification types is to call callback function when an event occurred. Implementation of the `__blocking_notifier_call_chain` is pretty simple, it just calls the `notifier_call_chain` function from the same source code file protected with read/write semaphore: +Where `nr_to_call` and `nr_calls` are number of notifier functions to be called and number of sent notifications. As you may guess the main goal of the `__blocking_notifer_call_chain` function and other functions for other notification types is to call callback function when an event occurs. Implementation of the `__blocking_notifier_call_chain` is pretty simple, it just calls the `notifier_call_chain` function from the same source code file protected with read/write semaphore: ```C int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, @@ -266,7 +266,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, } ``` -and returns its result. In this case all job is done by the `notifier_call_chain` function. Main purpose of this function informs registered notifiers about an asynchronous event: +and returns its result. In this case all job is done by the `notifier_call_chain` function. Main purpose of this function is to inform registered notifiers about an asynchronous event: ```C static int notifier_call_chain(struct notifier_block **nl, @@ -298,7 +298,7 @@ definition of the `module_notify_list` in the [kernel/module.c](https://github.c * MODULE_STATE_COMING * MODULE_STATE_GOING -in which maybe interested some subsystems of the Linux kernel. For example tracing of kernel modules states. Instead of direct call of the `atomic_notifier_chain_register`, `blocking_notifier_chain_register` and etc., most notification chains come with a set of wrappers used to register to them. Registatrion on these modules events is going with the help of such wrapper: +in which maybe interested some subsystems of the Linux kernel. For example tracing of kernel modules states. Instead of direct call of the `atomic_notifier_chain_register`, `blocking_notifier_chain_register` and etc., most notification chains come with a set of wrappers used to register to them. Registration on these modules events is going with the help of such wrapper: ```C int register_module_notifier(struct notifier_block *nb) @@ -348,7 +348,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, } ``` -Thus when one of these system call will be called from userspace, the Linux kernel will send certain notification depends on a system call and the `tracepoint_module_notify` callback function will be called. +Thus when one of these system call will be called from userspace, the Linux kernel will send certain notification depending on a system call and the `tracepoint_module_notify` callback function will be called. That's all. diff --git a/DataStructures/linux-datastructures-2.md b/DataStructures/linux-datastructures-2.md index cd6b90fe..58c3e2ad 100644 --- a/DataStructures/linux-datastructures-2.md +++ b/DataStructures/linux-datastructures-2.md @@ -4,7 +4,7 @@ Data Structures in the Linux Kernel Radix tree -------------------------------------------------------------------------------- -As you already know linux kernel provides many different libraries and functions which implement different data structures and algorithms. In this part we will consider one of these data structures - [Radix tree](http://en.wikipedia.org/wiki/Radix_tree). There are two files which are related to `radix tree` implementation and API in the linux kernel: +As you already know Linux kernel provides many different libraries and functions which implement different data structures and algorithms. In this part we will consider one of these data structures - [Radix tree](http://en.wikipedia.org/wiki/Radix_tree). There are two files which are related to `radix tree` implementation and API in the linux kernel: * [include/linux/radix-tree.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/radix-tree.h) * [lib/radix-tree.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/lib/radix-tree.c) @@ -43,7 +43,7 @@ Lets talk about what a `radix tree` is. Radix tree is a `compressed trie` where So in this example, we can see the `trie` with keys, `go` and `cat`. The compressed trie or `radix tree` differs from `trie` in that all intermediates nodes which have only one child are removed. -Radix tree in linux kernel is the data structure which maps values to integer keys. It is represented by the following structures from the file [include/linux/radix-tree.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/radix-tree.h): +Radix tree in Linux kernel is the data structure which maps values to integer keys. It is represented by the following structures from the file [include/linux/radix-tree.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/radix-tree.h): ```C struct radix_tree_root { @@ -98,7 +98,7 @@ This structure contains information about the offset in a parent and height from * `rcu_head` - used for freeing a node; * `private_list` - used by the user of a tree; -The two last fields of the `radix_tree_node` - `tags` and `slots` are important and interesting. Every node can contains a set of slots which are store pointers to the data. Empty slots in the linux kernel radix tree implementation store `NULL`. Radix trees in the linux kernel also supports tags which are associated with the `tags` fields in the `radix_tree_node` structure. Tags allow individual bits to be set on records which are stored in the radix tree. +The two last fields of the `radix_tree_node` - `tags` and `slots` are important and interesting. Every node can contains a set of slots which are store pointers to the data. Empty slots in the Linux kernel radix tree implementation store `NULL`. Radix trees in the linux kernel also supports tags which are associated with the `tags` fields in the `radix_tree_node` structure. Tags allow individual bits to be set on records which are stored in the radix tree. Now that we know about radix tree structure, it is time to look on its API. diff --git a/DataStructures/linux-datastructures-3.md b/DataStructures/linux-datastructures-3.md index c7015f65..ffca3f26 100644 --- a/DataStructures/linux-datastructures-3.md +++ b/DataStructures/linux-datastructures-3.md @@ -9,11 +9,11 @@ Besides different [linked](https://en.wikipedia.org/wiki/Linked_data_structure) * [lib/bitmap.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/lib/bitmap.c) * [include/linux/bitmap.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/bitmap.h) -Besides these two files, there is also architecture-specific header file which provides optimized bit operations for certain architecture. We consider [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture, so in our case it will be: +Besides these two files, there is also architecture-specific header file which provides optimized bit operations for certain architecture. We consider [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture, so in our case it will be: * [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/bitops.h) -header file. As I just wrote above, the `bitmap` is heavily used in the Linux kernel. For example a `bit array` is used to store set of online/offline processors for systems which support [hot-plug](https://www.kernel.org/doc/Documentation/cpu-hotplug.txt) cpu (more about this you can read in the [cpumasks](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2) part), a `bit array` stores set of allocated [irqs](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) during initialization of the Linux kernel and etc. +header file. As I just wrote above, the `bitmap` is heavily used in the Linux kernel. For example a `bit array` is used to store set of online/offline processors for systems which support [hot-plug](https://www.kernel.org/doc/Documentation/cpu-hotplug.txt) CPU (more about this you can read in the [cpumasks](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2) part), a `bit array` stores set of allocated [IRQs](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) during initialization of the Linux kernel and etc. So, the main goal of this part is to see how `bit arrays` are implemented in the Linux kernel. Let's start. @@ -75,7 +75,7 @@ I think that there is no need to explain what these function do. This is already In simple words atomic operations guarantees that two or more operations will not be performed on the same data concurrently. The `x86` architecture provides a set of atomic instructions, for example [xchg](http://x86.renejeschke.de/html/file_module_x86_id_328.html) instruction, [cmpxchg](http://x86.renejeschke.de/html/file_module_x86_id_41.html) instruction and etc. Besides atomic instructions, some of non-atomic instructions can be made atomic with the help of the [lock](http://x86.renejeschke.de/html/file_module_x86_id_159.html) instruction. It is enough to know about atomic operations for now, so we can begin to consider implementation of `set_bit` and `clear_bit` functions. -First of all, let's start to consider `non-atomic` variants of this function. Names of non-atomic `set_bit` and `clear_bit` starts from double underscore. As we already know, all of these functions are defined in the [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/bitops.h) header file and the first function is `__set_bit`: +First of all, let's start to consider `non-atomic` variants of this function. Names of non-atomic `set_bit` and `clear_bit` starts with double underscore. As we already know, all of these functions are defined in the [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/bitops.h) header file and the first function is `__set_bit`: ```C static inline void __set_bit(long nr, volatile unsigned long *addr) @@ -134,13 +134,13 @@ which means that this function will be always inlined to reduce size of the Linu #define IS_IMMEDIATE(nr) (__builtin_constant_p(nr)) ``` -The `__builtin_constant_p` builtin function returns `1` if the given parameter is known to be constant at compile-time and returns `0` in other case. We no need to use slow `bts` instruction to set bit if the given number of bit is known in compile time constant. We can just apply [bitwise or](https://en.wikipedia.org/wiki/Bitwise_operation#OR) for byte from the give address which contains given bit and masked number of bits where high bit is `1` and other is zero. In other case if the given number of bit is not known constant at compile-time, we do the same as we did in the `__set_bit` function. The `CONST_MASK_ADDR` macro: +The `__builtin_constant_p` builtin function returns `1` if the given parameter is known to be constant at compile-time and returns `0` in other case. We do not need to use slow `bts` instruction to set bit if the given number of bit is known in compile time constant. We can just apply [bitwise or](https://en.wikipedia.org/wiki/Bitwise_operation#OR) for byte from the give address which contains given bit and masked number of bits where high bit is `1` and other is zero. In other case if the given number of bit is not known constant at compile-time, we do the same as we did in the `__set_bit` function. The `CONST_MASK_ADDR` macro: ```C #define CONST_MASK_ADDR(nr, addr) BITOP_ADDR((void *)(addr) + ((nr)>>3)) ``` -expands to the give address with offset to the byte which contains a given bit. For example we have address `0x1000` and the number of bit is `0x9`. So, as `0x9` is `one byte + one bit` our address with be `addr + 1`: +expands to the given address with offset to the byte which contains a given bit. For example we have address `0x1000` and the number of bit is `0x9`. So, as `0x9` is `one byte + one bit` our address with be `addr + 1`: ```python >>> hex(0x1000 + (0x9 >> 3)) @@ -204,7 +204,7 @@ and as we can see it is very similar on `set_bit` and just contains two differen That's all. Now we can set and clear bit in any bit array and and we can go to other operations on bitmasks. -Most widely used operations on a bit arrays are set and clear bit in a bit array in the Linux kernel. But besides this operations it is useful to do additional operations on a bit array. Yet another widely used operation in the Linux kernel - is to know is a given bit set or not in a bit array. We can achieve this with the help of the `test_bit` macro. This macro is defined in the [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/bitops.h) header file and expands to the call of the `constant_test_bit` or `variable_test_bit` depends on bit number: +Most widely used operations on a bit arrays are set and clear bit in a bit array in the Linux kernel. But besides this operations it is useful to do additional operations on a bit array. Yet another widely used operation in the Linux kernel - is to know if a given bit is set or not in a bit array. We can achieve this with the help of the `test_bit` macro. This macro is defined in the [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/bitops.h) header file and expands to the call of the `constant_test_bit` or `variable_test_bit` depending on bit number: ```C #define test_bit(nr, addr) \ @@ -257,7 +257,7 @@ static inline void __change_bit(long nr, volatile unsigned long *addr) } ``` -Pretty easy, is not it? The implementation of the `__change_bit` is the same as `__set_bit`, but instead of `bts` instruction, we are using [btc](http://x86.renejeschke.de/html/file_module_x86_id_23.html). This instruction selects a given bit from a given bit array, stores its value in the `CF` and changes its value by the applying of complement operation. So, a bit with value `1` will be `0` and vice versa: +Pretty easy, is it not? The implementation of the `__change_bit` is the same as `__set_bit`, but instead of `bts` instruction, we are using [btc](http://x86.renejeschke.de/html/file_module_x86_id_23.html). This instruction selects a given bit from a given bit array, stores its value in the `CF` and changes its value by the applying of complement operation. So, a bit with value `1` will be `0` and vice versa: ```python >>> int(not 1) @@ -290,7 +290,7 @@ For this moment we know the most important architecture-specific operations with Common bit operations ================================================================================ -Besides the architecture-specific API from the [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/bitops.h) header file, the Linux kernel provides common API for manipulation of bit arrays. As we know from the beginning of this part, we can find it in the [include/linux/bitmap.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/bitmap.h) header file and additionally in the * [lib/bitmap.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/lib/bitmap.c) source code file. But before these source code files let's look into the [include/linux/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/bitops.h) header file which provides a set of useful macro. Let's look on some of they. +Besides the architecture-specific API from the [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/bitops.h) header file, the Linux kernel provides common API for manipulation of bit arrays. As we know from the beginning of this part, we can find it in the [include/linux/bitmap.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/bitmap.h) header file and additionally in the [lib/bitmap.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/lib/bitmap.c) source code file. But before these source code files let's look into the [include/linux/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/bitops.h) header file which provides a set of useful macro. Let's look on some of them. First of all let's look at following four macros: @@ -317,7 +317,7 @@ The next [header](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2 * `bitmap_zero`; * `bitmap_fill`. -To clear a bit array and fill it with `1`. Let's look on the implementation of the `bitmap_zero` function: +To clear a bit array or fill it with `1`. Let's look at the implementation of the `bitmap_zero` function: ```C static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) @@ -340,7 +340,7 @@ First of all we can see the check for `nbits`. The `small_const_nbits` is macro As we may see it checks that `nbits` is known constant in compile time and `nbits` value does not overflow `BITS_PER_LONG` or `64`. If bits number does not overflow amount of bits in a `long` value we can just set to zero. In other case we need to calculate how many `long` values do we need to fill our bit array and fill it with [memset](http://man7.org/linux/man-pages/man3/memset.3.html). -The implementation of the `bitmap_fill` function is similar on implementation of the `biramp_zero` function, except we fill a given bit array with `0xff` values or `0b11111111`: +The implementation of the `bitmap_fill` function is similar on implementation of the `bitmap_zero` function, except we fill a given bit array with `0xff` values or `0b11111111`: ```C static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) @@ -354,7 +354,7 @@ static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) } ``` -Besides the `bitmap_fill` and `bitmap_zero` functions, the [include/linux/bitmap.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/bitmap.h) header file provides `bitmap_copy` which is similar on the `bitmap_zero`, but just uses [memcpy](http://man7.org/linux/man-pages/man3/memcpy.3.html) instead of [memset](http://man7.org/linux/man-pages/man3/memset.3.html). Also it provides bitwise operations for bit array like `bitmap_and`, `bitmap_or`, `bitamp_xor` and etc. We will not consider implementation of these functions because it is easy to understand implementations of these functions if you understood all from this part. Anyway if you are interested how did these function implemented, you may open [include/linux/bitmap.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/bitmap.h) header file and start to research. +Besides the `bitmap_fill` and `bitmap_zero` functions, the [include/linux/bitmap.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/bitmap.h) header file provides `bitmap_copy` which is similar on the `bitmap_zero`, but just uses [memcpy](http://man7.org/linux/man-pages/man3/memcpy.3.html) instead of [memset](http://man7.org/linux/man-pages/man3/memset.3.html). Also it provides bitwise operations for bit array like `bitmap_and`, `bitmap_or`, `bitamp_xor` and etc. We will not consider implementation of these functions because it is easy to understand implementations of these functions if you understood all from this part. Anyway if you are interested in how these function are implemented, you may open [include/linux/bitmap.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/bitmap.h) header file and start to research. That's all. @@ -363,7 +363,7 @@ Links * [bitmap](https://en.wikipedia.org/wiki/Bit_array) * [linked data structures](https://en.wikipedia.org/wiki/Linked_data_structure) -* [tree data structures](https://en.wikipedia.org/wiki/Tree_%28data_structure%29) +* [tree data structures](https://en.wikipedia.org/wiki/Tree_%28data_structure%29) * [hot-plug](https://www.kernel.org/doc/Documentation/cpu-hotplug.txt) * [cpumasks](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2) * [IRQs](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) @@ -377,7 +377,7 @@ Links * [bt instruction](http://x86.renejeschke.de/html/file_module_x86_id_22.html) * [sbb instruction](http://x86.renejeschke.de/html/file_module_x86_id_286.html) * [btc instruction](http://x86.renejeschke.de/html/file_module_x86_id_23.html) -* [man memcpy](http://man7.org/linux/man-pages/man3/memcpy.3.html) +* [man memcpy](http://man7.org/linux/man-pages/man3/memcpy.3.html) * [man memset](http://man7.org/linux/man-pages/man3/memset.3.html) * [CF](https://en.wikipedia.org/wiki/FLAGS_register) * [inline assembler](https://en.wikipedia.org/wiki/Inline_assembler) diff --git a/Dockerfile b/Dockerfile index 48b772b3..bd0ae9ac 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,3 +1,10 @@ -FROM lrx0014/gitbook:3.2.3 +FROM kyselejsyrecek/gitbook:3.2.3 COPY ./ /srv/gitbook/ -EXPOSE 4000 \ No newline at end of file +EXPOSE 4000 +WORKDIR /srv/gitbook +CMD ["sh", "-c", "/usr/local/bin/gitbook serve"] + +# Examples: +#RUN gitbook pdf +#RUN gitbook epub + diff --git a/Initialization/README.md b/Initialization/README.md index da9dde69..6422b9e7 100644 --- a/Initialization/README.md +++ b/Initialization/README.md @@ -13,4 +13,4 @@ You will find here a couple of posts which describe the full cycle of kernel ini * [The End of the architecture-specific initializations, almost...](linux-initialization-7.md) - describes the end of the `setup_arch` related stuff. * [Scheduler initialization](linux-initialization-8.md) - describes preparation before scheduler initialization and initialization of it. * [RCU initialization](linux-initialization-9.md) - describes the initialization of the [RCU](http://en.wikipedia.org/wiki/Read-copy-update). -* [End of the initialization](linux-initialization-10.md) - the last part about linux kernel initialization. +* [End of the initialization](linux-initialization-10.md) - the last part about Linux kernel initialization. diff --git a/Initialization/images/CONFIG_NR_CPUS.png b/Initialization/images/CONFIG_NR_CPUS.png index 996f0293..5e36552b 100644 Binary files a/Initialization/images/CONFIG_NR_CPUS.png and b/Initialization/images/CONFIG_NR_CPUS.png differ diff --git a/Initialization/images/NX.png b/Initialization/images/NX.png index 059e26d7..9fdcd460 100644 Binary files a/Initialization/images/NX.png and b/Initialization/images/NX.png differ diff --git a/Initialization/images/brk_area.png b/Initialization/images/brk_area.png index e79b7d11..4a5ea54a 100644 Binary files a/Initialization/images/brk_area.png and b/Initialization/images/brk_area.png differ diff --git a/Initialization/images/kernel_command_line.png b/Initialization/images/kernel_command_line.png index e65f9b21..9ff8c7d0 100644 Binary files a/Initialization/images/kernel_command_line.png and b/Initialization/images/kernel_command_line.png differ diff --git a/Initialization/linux-initialization-1.md b/Initialization/linux-initialization-1.md index 598fba16..9ca18e74 100644 --- a/Initialization/linux-initialization-1.md +++ b/Initialization/linux-initialization-1.md @@ -674,7 +674,7 @@ The next step will be setup of the early `IDT` handlers, but it's big concept so Conclusion -------------------------------------------------------------------------------- -This is the end of the first part about linux kernel initialization. +This is the end of the first part about Linux kernel initialization. If you have questions or suggestions, feel free to ping me in twitter [0xAX](https://twitter.com/0xAX), drop me [email](mailto:anotherworldofworld@gmail.com) or just create [issue](https://github.com/0xAX/linux-insides/issues/new). diff --git a/Initialization/linux-initialization-10.md b/Initialization/linux-initialization-10.md index 3636737e..016f76cf 100644 --- a/Initialization/linux-initialization-10.md +++ b/Initialization/linux-initialization-10.md @@ -1,10 +1,10 @@ Kernel initialization. Part 10. ================================================================================ -End of the linux kernel initialization process +End of the Linux kernel initialization process ================================================================================ -This is tenth part of the chapter about linux kernel [initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization) and in the [previous part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-9) we saw the initialization of the [RCU](http://en.wikipedia.org/wiki/Read-copy-update) and stopped on the call of the `acpi_early_init` function. This part will be the last part of the [Kernel initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization) chapter, so let's finish it. +This is tenth part of the chapter about Linux kernel [initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization) and in the [previous part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-9) we saw the initialization of the [RCU](http://en.wikipedia.org/wiki/Read-copy-update) and stopped on the call of the `acpi_early_init` function. This part will be the last part of the [Kernel initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization) chapter, so let's finish it. After the call of the `acpi_early_init` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c), we can see the following code: @@ -132,12 +132,12 @@ structure from the [include/uapi/linux/resource.h](https://github.com/torvalds/l ```C cat /proc/self/limits -Limit Soft Limit Hard Limit Units +Limit Soft Limit Hard Limit Units ... ... ... -Max processes 63815 63815 processes -Max pending signals 63815 63815 signals +Max processes 63815 63815 processes +Max pending signals 63815 63815 signals ... ... ... @@ -149,7 +149,7 @@ Initialization of the caches The next function after the `fork_init` is the `proc_caches_init` from the [kernel/fork.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/fork.c). This function allocates caches for the memory descriptors (or `mm_struct` structure). At the beginning of the `proc_caches_init` we can see allocation of the different [SLAB](http://en.wikipedia.org/wiki/Slab_allocation) caches with the call of the `kmem_cache_create`: * `sighand_cachep` - manage information about installed signal handlers; -* `signal_cachep` - manage information about process signal descriptor; +* `signal_cachep` - manage information about process signal descriptor; * `files_cachep` - manage information about opened files; * `fs_cachep` - manage filesystem information. @@ -185,7 +185,7 @@ nrpages = (nr_free_buffer_pages() * 10) / 100; max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); ``` -which will be equal to the `10%` of the `ZONE_NORMAL` (all RAM from the 4GB on the `x86_64`). The next function after the `buffer_init` is - `vfs_caches_init`. This function allocates `SLAB` caches and hashtable for different [VFS](http://en.wikipedia.org/wiki/Virtual_file_system) caches. We already saw the `vfs_caches_init_early` function in the eighth part of the linux kernel [initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-8) which initialized caches for `dcache` (or directory-cache) and [inode](http://en.wikipedia.org/wiki/Inode) cache. The `vfs_caches_init` function makes post-early initialization of the `dcache` and `inode` caches, private data cache, hash tables for the mount points, etc. More details about [VFS](http://en.wikipedia.org/wiki/Virtual_file_system) will be described in the separate part. After this we can see `signals_init` function. This function is defined in the [kernel/signal.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/signal.c) and allocates a cache for the `sigqueue` structures which represents queue of the real time signals. The next function is `page_writeback_init`. This function initializes the ratio for the dirty pages. Every low-level page entry contains the `dirty` bit which indicates whether a page has been written to after been loaded into memory. +which will be equal to the `10%` of the `ZONE_NORMAL` (all RAM from the 4GB on the `x86_64`). The next function after the `buffer_init` is - `vfs_caches_init`. This function allocates `SLAB` caches and hashtable for different [VFS](http://en.wikipedia.org/wiki/Virtual_file_system) caches. We already saw the `vfs_caches_init_early` function in the eighth part of the Linux kernel [initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-8) which initialized caches for `dcache` (or directory-cache) and [inode](http://en.wikipedia.org/wiki/Inode) cache. The `vfs_caches_init` function makes post-early initialization of the `dcache` and `inode` caches, private data cache, hash tables for the mount points, etc. More details about [VFS](http://en.wikipedia.org/wiki/Virtual_file_system) will be described in the separate part. After this we can see `signals_init` function. This function is defined in the [kernel/signal.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/signal.c) and allocates a cache for the `sigqueue` structures which represents queue of the real time signals. The next function is `page_writeback_init`. This function initializes the ratio for the dirty pages. Every low-level page entry contains the `dirty` bit which indicates whether a page has been written to after been loaded into memory. Creation of the root for the procfs -------------------------------------------------------------------------------- @@ -230,9 +230,9 @@ and a couple of directories depends on the different configuration options: In the end of the `proc_root_init` we call the `proc_sys_init` function which creates `/proc/sys` directory and initializes the [Sysctl](http://en.wikipedia.org/wiki/Sysctl). -It is the end of `start_kernel` function. I did not describe all functions which are called in the `start_kernel`. I skipped them, because they are not important for the generic kernel initialization stuff and depend on only different kernel configurations. They are `taskstats_init_early` which exports per-task statistic to the user-space, `delayacct_init` - initializes per-task delay accounting, `key_init` and `security_init` initialize different security stuff, `check_bugs` - fix some architecture-dependent bugs, `ftrace_init` function executes initialization of the [ftrace](https://www.kernel.org/doc/Documentation/trace/ftrace.txt), `cgroup_init` makes initialization of the rest of the [cgroup](http://en.wikipedia.org/wiki/Cgroups) subsystem,etc. Many of these parts and subsystems will be described in the other chapters. +It is the end of `start_kernel` function. I did not describe all functions which are called in the `start_kernel`. I skipped them, because they are not important for the generic kernel initialization stuff and depend on only different kernel configurations. They are `taskstats_init_early` which exports per-task statistic to the user-space, `delayacct_init` - initializes per-task delay accounting, `key_init` and `security_init` initialize different security stuff, `check_bugs` - fix some architecture-dependent bugs, `ftrace_init` function executes initialization of the [ftrace](https://www.kernel.org/doc/Documentation/trace/ftrace.txt), `cgroup_init` makes initialization of the rest of the [cgroup](http://en.wikipedia.org/wiki/Cgroups) subsystem, etc. Many of these parts and subsystems will be described in the other chapters. -That's all. Finally we have passed through the long-long `start_kernel` function. But it is not the end of the linux kernel initialization process. We haven't run the first process yet. In the end of the `start_kernel` we can see the last call of the - `rest_init` function. Let's go ahead. +That's all. Finally we have passed through the long-long `start_kernel` function. But it is not the end of the Linux kernel initialization process. We haven't run the first process yet. In the end of the `start_kernel` we can see the last call of the - `rest_init` function. Let's go ahead. First steps after the start_kernel -------------------------------------------------------------------------------- @@ -251,7 +251,7 @@ kernel_thread(kernel_init, NULL, CLONE_FS); pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); ``` -Here the `kernel_thread` function (defined in the [kernel/fork.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/fork.c)) creates new kernel thread.As we can see the `kernel_thread` function takes three arguments: +Here the `kernel_thread` function (defined in the [kernel/fork.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/fork.c)) creates new kernel thread. As we can see the `kernel_thread` function takes three arguments: * Function which will be executed in a new thread; * Parameter for the `kernel_init` function; @@ -271,7 +271,7 @@ Let's postpone `kernel_init` and `kthreadd` for now and go ahead in the `rest_in kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns); rcu_read_unlock(); ``` - + The first `rcu_read_lock` function marks the beginning of an [RCU](http://en.wikipedia.org/wiki/Read-copy-update) read-side critical section and the `rcu_read_unlock` marks the end of an RCU read-side critical section. We call these functions because we need to protect the `find_task_by_pid_ns`. The `find_task_by_pid_ns` returns pointer to the `task_struct` by the given pid. So, here we are getting the pointer to the `task_struct` for `PID = 2` (we got it after `kthreadd` creation with the `kernel_thread`). In the next step we call `complete` function ```C @@ -314,7 +314,7 @@ void init_idle_bootup_task(struct task_struct *idle) } ``` -where `idle` class is a low task priority and tasks can be run only when the processor doesn't have anything to run besides this tasks. The second function `schedule_preempt_disabled` disables preempt in `idle` tasks. And the third function `cpu_startup_entry` is defined in the [kernel/sched/idle.c](https://github.com/torvalds/linux/blob/master/kernel/sched/idle.c) and calls `cpu_idle_loop` from the [kernel/sched/idle.c](https://github.com/torvalds/linux/blob/master/kernel/sched/idle.c). The `cpu_idle_loop` function works as process with `PID = 0` and works in the background. Main purpose of the `cpu_idle_loop` is to consume the idle CPU cycles. When there is no process to run, this process starts to work. We have one process with `idle` scheduling class (we just set the `current` task to the `idle` with the call of the `init_idle_bootup_task` function), so the `idle` thread does not do useful work but just checks if there is an active task to switch to: +where `idle` class is a low task priority and tasks can be run only when the processor doesn't have anything to run besides this tasks. The second function `schedule_preempt_disabled` disables preempt in `idle` tasks. And the third function `cpu_startup_entry` is defined in the [kernel/sched/idle.c](https://github.com/torvalds/linux/blob/master/kernel/sched/idle.c) and calls `cpu_idle_loop` from the [kernel/sched/idle.c](https://github.com/torvalds/linux/blob/master/kernel/sched/idle.c). The `cpu_idle_loop` function works as process with `PID = 0` and works in the background. Main purpose of the `cpu_idle_loop` is to consume the idle CPU cycles. When there is no process to run, this process starts to work. We have one process with `idle` scheduling class (we just set the `current` task to the `idle` with the call of the `init_idle_bootup_task` function), so the `idle` thread does not do useful work but just checks if there is an active task to switch to: ```C static void cpu_idle_loop(void) @@ -418,7 +418,7 @@ The `do_execve` function is defined in the [include/linux/sched.h](https://githu } ``` -If we did not pass `init=` kernel command line parameter either, kernel tries to run one of the following executable files: +If we did not pass `init=` kernel command line parameter either, kernel tries to run one of the following executable files: ```C if (!try_to_run_init_process("/sbin/init") || @@ -440,7 +440,7 @@ That's all! Linux kernel initialization process is finished! Conclusion -------------------------------------------------------------------------------- -It is the end of the tenth part about the linux kernel [initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization). It is not only the `tenth` part, but also is the last part which describes initialization of the linux kernel. As I wrote in the first [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-1) of this chapter, we will go through all steps of the kernel initialization and we did it. We started at the first architecture-independent function - `start_kernel` and finished with the launch of the first `init` process in the our system. I skipped details about different subsystem of the kernel, for example I almost did not cover scheduler, interrupts, exception handling, etc. From the next part we will start to dive to the different kernel subsystems. Hope it will be interesting. +It is the end of the tenth part about the Linux kernel [initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization). It is not only the `tenth` part, but also is the last part which describes initialization of the linux kernel. As I wrote in the first [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-1) of this chapter, we will go through all steps of the kernel initialization and we did it. We started at the first architecture-independent function - `start_kernel` and finished with the launch of the first `init` process in the our system. I skipped details about different subsystem of the kernel, for example I almost did not cover scheduler, interrupts, exception handling, etc. From the next part we will start to dive to the different kernel subsystems. Hope it will be interesting. If you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX). diff --git a/Initialization/linux-initialization-2.md b/Initialization/linux-initialization-2.md index 2b26205f..3e9bcbea 100644 --- a/Initialization/linux-initialization-2.md +++ b/Initialization/linux-initialization-2.md @@ -612,7 +612,7 @@ All what this function does is just returns `1` if the exception is generated be Conclusion -------------------------------------------------------------------------------- -This is the end of the second part about linux kernel insides. If you have questions or suggestions, ping me in twitter [0xAX](https://twitter.com/0xAX), drop me [email](mailto:anotherworldofworld@gmail.com) or just create [issue](https://github.com/0xAX/linux-insides/issues/new). In the next part we will see all steps before kernel entry point - `start_kernel` function. +This is the end of the second part about Linux kernel insides. If you have questions or suggestions, ping me in twitter [0xAX](https://twitter.com/0xAX), drop me [email](mailto:anotherworldofworld@gmail.com) or just create [issue](https://github.com/0xAX/linux-insides/issues/new). In the next part we will see all steps before kernel entry point - `start_kernel` function. **Please note that English is not my first language and I am really sorry for any inconvenience. If you found any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).** diff --git a/Initialization/linux-initialization-3.md b/Initialization/linux-initialization-3.md index 0b86270b..ad916852 100644 --- a/Initialization/linux-initialization-3.md +++ b/Initialization/linux-initialization-3.md @@ -4,7 +4,7 @@ Kernel initialization. Part 3. Last preparations before the kernel entry point -------------------------------------------------------------------------------- -This is the third part of the Linux kernel initialization process series. In the previous [part](https://github.com/0xAX/linux-insides/blob/master/Initialization/linux-initialization-2.md) we saw early interrupt and exception handling and will continue to dive into the linux kernel initialization process in the current part. Our next point is 'kernel entry point' - `start_kernel` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) source code file. Yes, technically it is not kernel's entry point but the start of the generic kernel code which does not depend on certain architecture. But before we call the `start_kernel` function, we must do some preparations. So let's continue. +This is the third part of the Linux kernel initialization process series. In the previous [part](https://github.com/0xAX/linux-insides/blob/master/Initialization/linux-initialization-2.md) we saw early interrupt and exception handling and will continue to dive into the Linux kernel initialization process in the current part. Our next point is 'kernel entry point' - `start_kernel` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) source code file. Yes, technically it is not kernel's entry point but the start of the generic kernel code which does not depend on certain architecture. But before we call the `start_kernel` function, we must do some preparations. So let's continue. boot_params again -------------------------------------------------------------------------------- @@ -29,10 +29,10 @@ Now let's look at `__va` macro. This macro defined in [init/main.c](https://gith #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) ``` -where `PAGE_OFFSET` is `__PAGE_OFFSET` which is `0xffff880000000000` and the base virtual address of the direct mapping of all physical memory. So we're getting virtual address of the `boot_params` structure and pass it to the `copy_bootdata` function, where we copy `real_mod_data` to the `boot_params` which is declared in the [arch/x86/include/asm/setup.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/setup.h) +where `PAGE_OFFSET` is `__PAGE_OFFSET` which is `0xffff880000000000` and the base virtual address of the direct mapping of all physical memory. So we're getting virtual address of variable `boot_params` which come along from real mode, and pass it to the `copy_bootdata` function, where we copy `real_mode_data` to the `boot_params` which is defined in the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/d9919d43cbf6790d2bc0c0a2743c51fc25f26919/arch/x86/kernel/setup.c) ```C -extern struct boot_params boot_params; +struct boot_params boot_params; ``` Let's look at the `copy_boot_data` implementation: @@ -181,7 +181,7 @@ if (paravirt_enabled()) return; ``` -we exit from the `reserve_ebda_region` function if paravirtualization is enabled because if it enabled the extended bios data area is absent. In the next step we need to get the end of the low memory: +we exit from the `reserve_ebda_region` function if paravirtualization is enabled because if it enabled the extended BIOS data area is absent. In the next step we need to get the end of the low memory: ```C lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); @@ -205,7 +205,7 @@ static inline unsigned int get_bios_ebda(void) } ``` -Let's try to understand how it works. Here we can see that we converting physical address `0x40E` to the virtual, where `0x0040:0x000e` is the segment which contains base address of the extended BIOS data area. Don't worry that we are using `phys_to_virt` function for converting a physical address to virtual address. You can note that previously we have used `__va` macro for the same point, but `phys_to_virt` is the same: +Let's try to understand how it works. Here we can see that we are converting physical address `0x40E` to the virtual, where `0x0040:0x000e` is the segment which contains base address of the extended BIOS data area. Don't worry that we are using `phys_to_virt` function for converting a physical address to virtual address. You can note that previously we have used `__va` macro for the same point, but `phys_to_virt` is the same: ```C static inline void *phys_to_virt(phys_addr_t address) @@ -242,7 +242,7 @@ which is: #define INSANE_CUTOFF 0x20000U ``` -or 128 kilobytes. In the last step we get lower part in the low memory and extended bios data area and call `memblock_reserve` function which will reserve memory region for extended bios data between low memory and one megabyte mark: +or 128 kilobytes. In the last step we get lower part in the low memory and extended BIOS data area and call `memblock_reserve` function which will reserve memory region for extended BIOS data between low memory and one megabyte mark: ```C lowmem = min(lowmem, ebda_addr); @@ -255,12 +255,12 @@ memblock_reserve(lowmem, 0x100000 - lowmem); * base physical address; * region size. -and reserves memory region for the given base address and size. `memblock_reserve` is the first function in this book from linux kernel memory manager framework. We will take a closer look on memory manager soon, but now let's look at its implementation. +and reserves memory region for the given base address and size. `memblock_reserve` is the first function in this book from Linux kernel memory manager framework. We will take a closer look on memory manager soon, but now let's look at its implementation. -First touch of the linux kernel memory manager framework +First touch of the Linux kernel memory manager framework -------------------------------------------------------------------------------- -In the previous paragraph we stopped at the call of the `memblock_reserve` function and as i said before it is the first function from the memory manager framework. Let's try to understand how it works. `memblock_reserve` function just calls: +In the previous paragraph we stopped at the call of the `memblock_reserve` function and as I said before it is the first function from the memory manager framework. Let's try to understand how it works. `memblock_reserve` function just calls: ```C memblock_reserve_region(base, size, MAX_NUMNODES, 0); @@ -290,7 +290,7 @@ struct memblock_type { }; ``` -As we need to reserve memory block for extended bios data area, the type of the current memory region is reserved where `memblock` structure is: +As we need to reserve memory block for extended BIOS data area, the type of the current memory region is reserved where `memblock` structure is: ```C struct memblock { @@ -401,7 +401,7 @@ static inline void memblock_set_region_node(struct memblock_region *r, int nid) } ``` -After this we will have first reserved `memblock` for the extended bios data area in the `.meminit.data` section. `reserve_ebda_region` function finished its work on this step and we can go back to the [arch/x86/kernel/head64.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/head64.c). +After this we will have first reserved `memblock` for the extended BIOS data area in the `.meminit.data` section. `reserve_ebda_region` function finished its work on this step and we can go back to the [arch/x86/kernel/head64.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/head64.c). We finished all preparations before the kernel entry point! The last step in the `x86_64_start_reservations` function is the call of the: @@ -416,7 +416,7 @@ That's all for this part. Conclusion -------------------------------------------------------------------------------- -It is the end of the third part about linux kernel insides. In next part we will see the first initialization steps in the kernel entry point - `start_kernel` function. It will be the first step before we will see launch of the first `init` process. +It is the end of the third part about Linux kernel insides. In next part we will see the first initialization steps in the kernel entry point - `start_kernel` function. It will be the first step before we will see launch of the first `init` process. If you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX). diff --git a/Initialization/linux-initialization-4.md b/Initialization/linux-initialization-4.md index fd21b853..f720f6d2 100644 --- a/Initialization/linux-initialization-4.md +++ b/Initialization/linux-initialization-4.md @@ -4,7 +4,7 @@ Kernel initialization. Part 4. Kernel entry point ================================================================================ -If you have read the previous part - [Last preparations before the kernel entry point](https://github.com/0xAX/linux-insides/blob/master/Initialization/linux-initialization-3.md), you can remember that we finished all pre-initialization stuff and stopped right before the call to the `start_kernel` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c). The `start_kernel` is the entry of the generic and architecture independent kernel code, although we will return to the `arch/` folder many times. If you look inside of the `start_kernel` function, you will see that this function is very big. For this moment it contains about `86` calls of functions. Yes, it's very big and of course this part will not cover all the processes that occur in this function. In the current part we will only start to do it. This part and all the next which will be in the [Kernel initialization process](https://github.com/0xAX/linux-insides/blob/master/Initialization/README.md) chapter will cover it. +If you have read the previous part - [Last preparations before the kernel entry point](https://github.com/0xAX/linux-insides/blob/master/Initialization/linux-initialization-3.md), you can remember that we finished all pre-initialization stuff and stopped right before the call to the `start_kernel` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c). The `start_kernel` is the entry of the generic and architecture independent kernel code, although we will return to the `arch/` folder many times. If you look inside of the `start_kernel` function, you will see that this function is very big. For this moment it contains about `86` function calls. Yes, it's very big and of course this part will not cover all the processes that occur in this function. In the current part we will only start to do it. This part and all the next which will be in the [Kernel initialization process](https://github.com/0xAX/linux-insides/blob/master/Initialization/README.md) chapter will cover it. The main purpose of the `start_kernel` to finish kernel initialization process and launch the first `init` process. Before the first process will be started, the `start_kernel` must do many things such as: to enable [lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt), to initialize processor id, to enable early [cgroups](http://en.wikipedia.org/wiki/Cgroups) subsystem, to setup per-cpu areas, to initialize different caches in [vfs](http://en.wikipedia.org/wiki/Virtual_file_system), to initialize memory manager, rcu, vmalloc, scheduler, IRQs, ACPI and many many more. Only after these steps will we see the launch of the first `init` process in the last part of this chapter. So much kernel code awaits us, let's start. @@ -53,7 +53,7 @@ struct task_struct init_task = INIT_TASK(init_task); where `task_struct` stores all the information about a process. I will not explain this structure in this book because it's very big. You can find its definition in [include/linux/sched.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/sched.h#L1278). At this moment `task_struct` contains more than `100` fields! Although you will not see the explanation of the `task_struct` in this book, we will use it very often since it is the fundamental structure which describes the `process` in the Linux kernel. I will describe the meaning of the fields of this structure as we meet them in practice. -You can see the definition of the `init_task` and it initialized by the `INIT_TASK` macro. This macro is from [include/linux/init_task.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/init_task.h) and it just fills the `init_task` with the values for the first process. For example it sets: +You can see the definition of the `init_task` and it is initialized by the `INIT_TASK` macro. This macro is from [include/linux/init_task.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/init_task.h) and it just fills the `init_task` with the values for the first process. For example it sets: * init process state to zero or `runnable`. A runnable process is one which is waiting only for a CPU to run on; * init process flags - `PF_KTHREAD` which means - kernel thread; @@ -63,7 +63,7 @@ You can see the definition of the `init_task` and it initialized by the `INIT_TA ```C union thread_union { - struct thread_info thread_info; + struct thread_info thread_info; unsigned long stack[THREAD_SIZE/sizeof(long)]; }; ``` @@ -74,7 +74,7 @@ Every process has its own stack and it is 16 kilobytes or 4 page frames in `x86_ struct thread_info { struct task_struct *task; struct exec_domain *exec_domain; - __u32 flags; + __u32 flags; __u32 status; __u32 cpu; int saved_preempt_count; @@ -179,7 +179,7 @@ As we got the end of the `init` process stack, we write `STACK_END_MAGIC` there. if (*end_of_stack(task) != STACK_END_MAGIC) { // // handle stack overflow here - // + // } ``` @@ -263,16 +263,16 @@ Remember that we have passed `cpu_number` as `pcp` to the `this_cpu_read` from t __bad_size_call_parameter(); break; \ } \ pscr_ret__; \ -}) +}) ``` -Yes, it looks a little strange but it's easy. First of all we can see the definition of the `pscr_ret__` variable with the `int` type. Why int? Ok, `variable` is `common_cpu` and it was declared as per-cpu int variable: +Yes, it looks a little strange but it's easy. First of all we can see the definition of the `pscr_ret__` variable with the `int` type. Why int? Ok, `variable` is `cpu_number` and it was declared as per-cpu int variable: ```C DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number); ``` -In the next step we call `__verify_pcpu_ptr` with the address of `cpu_number`. `__veryf_pcpu_ptr` used to verify that the given parameter is a per-cpu pointer. After that we set `pscr_ret__` value which depends on the size of the variable. Our `common_cpu` variable is `int`, so it 4 bytes in size. It means that we will get `this_cpu_read_4(common_cpu)` in `pscr_ret__`. In the end of the `__pcpu_size_call_return` we just call it. `this_cpu_read_4` is a macro: +In the next step we call `__verify_pcpu_ptr` with the address of `cpu_number`. `__veryf_pcpu_ptr` used to verify that the given parameter is a per-cpu pointer. After that we set `pscr_ret__` value which depends on the size of the variable. Our `cpu_number` variable is `int`, so it's 4 bytes in size. It means that we will get `this_cpu_read_4(cpu_number)` in `pscr_ret__`. In the end of the `__pcpu_size_call_return` we just call it. `this_cpu_read_4` is a macro: ```C #define this_cpu_read_4(pcp) percpu_from_op("mov", pcp) @@ -281,19 +281,19 @@ In the next step we call `__verify_pcpu_ptr` with the address of `cpu_number`. ` which calls `percpu_from_op` and pass `mov` instruction and per-cpu variable there. `percpu_from_op` will expand to the inline assembly call: ```C -asm("movl %%gs:%1,%0" : "=r" (pfo_ret__) : "m" (common_cpu)) +asm("movl %%gs:%1,%0" : "=r" (pfo_ret__) : "m" (cpu_number)) ``` -Let's try to understand how it works and what it does. The `gs` segment register contains the base of per-cpu area. Here we just copy `common_cpu` which is in memory to the `pfo_ret__` with the `movl` instruction. Or with another words: +Let's try to understand how it works and what it does. The `gs` segment register contains the base of per-cpu area. Here we just copy `cpu_number` which is in memory to the `pfo_ret__` with the `movl` instruction. Or with another words: ```C -this_cpu_read(common_cpu) +this_cpu_read(cpu_number) ``` is the same as: ```C -movl %gs:$common_cpu, $pfo_ret__ +movl %gs:$cpu_number, $pfo_ret__ ``` As we didn't setup per-cpu area, we have only one - for the current running CPU, we will get `zero` as a result of the `smp_processor_id`. @@ -350,7 +350,7 @@ If you're not sure that this `set_cpu_*` operations and `cpumask` are not clear As we activated the bootstrap processor, it's time to go to the next function in the `start_kernel.` Now it is `page_address_init`, but this function does nothing in our case, because it executes only when all `RAM` can't be mapped directly. -Print linux banner +Print Linux banner --------------------------------------------------------------------------------- The next call is `pr_notice`: diff --git a/Initialization/linux-initialization-5.md b/Initialization/linux-initialization-5.md index 95c26f0e..ad0fd09a 100644 --- a/Initialization/linux-initialization-5.md +++ b/Initialization/linux-initialization-5.md @@ -31,7 +31,7 @@ We already saw implementation of the `set_intr_gate` in the previous part about * base address of the interrupt/exception handler; * third parameter is - `Interrupt Stack Table`. `IST` is a new mechanism in the `x86_64` and part of the [TSS](http://en.wikipedia.org/wiki/Task_state_segment). Every active thread in kernel mode has own kernel stack which is `16` kilobytes. While a thread in user space, this kernel stack is empty. -In addition to per-thread stacks, there are a couple of specialized stacks associated with each CPU. All about these stack you can read in the linux kernel documentation - [Kernel stacks](https://www.kernel.org/doc/Documentation/x86/kernel-stacks). `x86_64` provides feature which allows to switch to a new `special` stack for during any events as non-maskable interrupt and etc... And the name of this feature is - `Interrupt Stack Table`. There can be up to 7 `IST` entries per CPU and every entry points to the dedicated stack. In our case this is `DEBUG_STACK`. +In addition to per-thread stacks, there are a couple of specialized stacks associated with each CPU. All about these stack you can read in the Linux kernel documentation - [Kernel stacks](https://www.kernel.org/doc/Documentation/x86/kernel-stacks). `x86_64` provides feature which allows to switch to a new `special` stack for during any events as non-maskable interrupt and etc... And the name of this feature is - `Interrupt Stack Table`. There can be up to 7 `IST` entries per CPU and every entry points to the dedicated stack. In our case this is `DEBUG_STACK`. `set_intr_gate_ist` and `set_system_intr_gate_ist` work by the same principle as `set_intr_gate` with only one difference. Both of these functions checks interrupt number and call `_set_gate` inside: @@ -43,12 +43,12 @@ _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS); as `set_intr_gate` does this. But `set_intr_gate` calls `_set_gate` with [dpl](http://en.wikipedia.org/wiki/Privilege_level) - 0, and ist - 0, but `set_intr_gate_ist` and `set_system_intr_gate_ist` sets `ist` as `DEBUG_STACK` and `set_system_intr_gate_ist` sets `dpl` as `0x3` which is the lowest privilege. When an interrupt occurs and the hardware loads such a descriptor, then hardware automatically sets the new stack pointer based on the IST value, then invokes the interrupt handler. All of the special kernel stacks will be set in the `cpu_init` function (we will see it later). -As `#DB` and `#BP` gates written to the `idt_descr`, we reload `IDT` table with `load_idt` which just call `ldtr` instruction. Now let's look on interrupt handlers and will try to understand how they works. Of course, I can't cover all interrupt handlers in this book and I do not see the point in this. It is very interesting to delve in the linux kernel source code, so we will see how `debug` handler implemented in this part, and understand how other interrupt handlers are implemented will be your task. +As `#DB` and `#BP` gates written to the `idt_descr`, we reload `IDT` table with `load_idt` which just call `ldtr` instruction. Now let's look on interrupt handlers and will try to understand how they works. Of course, I can't cover all interrupt handlers in this book and I do not see the point in this. It is very interesting to delve in the Linux kernel source code, so we will see how `debug` handler implemented in this part, and understand how other interrupt handlers are implemented will be your task. #DB handler -------------------------------------------------------------------------------- -As you can read above, we passed address of the `#DB` handler as `&debug` in the `set_intr_gate_ist`. [lxr.free-electrons.com](http://lxr.free-electrons.com/ident) is a great resource for searching identifiers in the linux kernel source code, but unfortunately you will not find `debug` handler with it. All of you can find, it is `debug` definition in the [arch/x86/include/asm/traps.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/traps.h): +As you can read above, we passed address of the `#DB` handler as `&debug` in the `set_intr_gate_ist`. [lxr.free-electrons.com](http://lxr.free-electrons.com/ident) is a great resource for searching identifiers in the Linux kernel source code, but unfortunately you will not find `debug` handler with it. All of you can find, it is `debug` definition in the [arch/x86/include/asm/traps.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/traps.h): ```C asmlinkage void debug(void); @@ -163,9 +163,9 @@ The next step is initialization of early `ioremap`. In general there are two way * I/O Ports; * Device memory. -We already saw first method (`outb/inb` instructions) in the part about linux kernel booting [process](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-3). The second method is to map I/O physical addresses to virtual addresses. When a physical address is accessed by the CPU, it may refer to a portion of physical RAM which can be mapped on memory of the I/O device. So `ioremap` used to map device memory into kernel address space. +We already saw first method (`outb/inb` instructions) in the part about Linux kernel booting [process](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-3). The second method is to map I/O physical addresses to virtual addresses. When a physical address is accessed by the CPU, it may refer to a portion of physical RAM which can be mapped on memory of the I/O device. So `ioremap` used to map device memory into kernel address space. -As i wrote above next function is the `early_ioremap_init` which re-maps I/O memory to kernel address space so it can access it. We need to initialize early ioremap for early initialization code which needs to temporarily map I/O or memory regions before the normal mapping functions like `ioremap` are available. Implementation of this function is in the [arch/x86/mm/ioremap.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/mm/ioremap.c). At the start of the `early_ioremap_init` we can see definition of the `pmd` pointer with `pmd_t` type (which presents page middle directory entry `typedef struct { pmdval_t pmd; } pmd_t;` where `pmdval_t` is `unsigned long`) and make a check that `fixmap` aligned in a correct way: +As I wrote above next function is the `early_ioremap_init` which re-maps I/O memory to kernel address space so it can access it. We need to initialize early ioremap for early initialization code which needs to temporarily map I/O or memory regions before the normal mapping functions like `ioremap` are available. Implementation of this function is in the [arch/x86/mm/ioremap.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/mm/ioremap.c). At the start of the `early_ioremap_init` we can see definition of the `pmd` pointer with `pmd_t` type (which presents page middle directory entry `typedef struct { pmdval_t pmd; } pmd_t;` where `pmdval_t` is `unsigned long`) and make a check that `fixmap` aligned in a correct way: ```C pmd_t *pmd; @@ -198,7 +198,7 @@ After early `ioremap` was initialized, you can see the following code: ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); ``` -This code obtains major and minor numbers for the root device where `initrd` will be mounted later in the `do_mount_root` function. Major number of the device identifies a driver associated with the device. Minor number referred on the device controlled by driver. Note that `old_decode_dev` takes one parameter from the `boot_params_structure`. As we can read from the x86 linux kernel boot protocol: +This code obtains major and minor numbers for the root device where `initrd` will be mounted later in the `do_mount_root` function. Major number of the device identifies a driver associated with the device. Minor number referred on the device controlled by driver. Note that `old_decode_dev` takes one parameter from the `boot_params_structure`. As we can read from the x86 Linux kernel boot protocol: ``` Field name: root_dev @@ -298,7 +298,7 @@ presents abstraction for a tree-like subset of system resources. This structure | +-------------+ | | -| child | +| child | | | +-------------+ ``` @@ -408,7 +408,7 @@ static inline void __init copy_edd(void) Memory descriptor initialization -------------------------------------------------------------------------------- -The next step is initialization of the memory descriptor of the init process. As you already can know every process has its own address space. This address space presented with special data structure which called `memory descriptor`. Directly in the linux kernel source code memory descriptor presented with `mm_struct` structure. `mm_struct` contains many different fields related with the process address space as start/end address of the kernel code/data, start/end of the brk, number of memory areas, list of memory areas and etc... This structure defined in the [include/linux/mm_types.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/mm_types.h). As every process has its own memory descriptor, `task_struct` structure contains it in the `mm` and `active_mm` field. And our first `init` process has it too. You can remember that we saw the part of initialization of the init `task_struct` with `INIT_TASK` macro in the previous [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-4): +The next step is initialization of the memory descriptor of the init process. As you already can know every process has its own address space. This address space presented with special data structure which called `memory descriptor`. Directly in the Linux kernel source code memory descriptor presented with `mm_struct` structure. `mm_struct` contains many different fields related with the process address space as start/end address of the kernel code/data, start/end of the brk, number of memory areas, list of memory areas and etc... This structure defined in the [include/linux/mm_types.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/mm_types.h). As every process has its own memory descriptor, `task_struct` structure contains it in the `mm` and `active_mm` field. And our first `init` process has it too. You can remember that we saw the part of initialization of the init `task_struct` with `INIT_TASK` macro in the previous [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-4): ```C #define INIT_TASK(tsk) \ @@ -422,7 +422,7 @@ The next step is initialization of the memory descriptor of the init process. As } ``` -`mm` points to the process address space and `active_mm` points to the active address space if process has no address space such as kernel threads (more about it you can read in the [documentation](https://www.kernel.org/doc/Documentation/vm/active_mm.txt)). Now we fill memory descriptor of the initial process: +`mm` points to the process address space and `active_mm` points to the active address space if process has no address space such as kernel threads (more about it you can read in the [documentation](https://www.kernel.org/doc/Documentation/vm/active_mm.txt)). Now we fill memory descriptor of the initial process: ```C init_mm.start_code = (unsigned long) _text; @@ -457,7 +457,7 @@ where `mm_rb` is a red-black tree of the virtual memory areas, `pgd` is a pointe bss_resource.end = __pa_symbol(__bss_stop)-1; ``` -We already know a little about `resource` structure (read above). Here we fills code/data/bss resources with their physical addresses. You can see it in the `/proc/iomem`: +We already know a little about `resource` structure (read above). Here we fill code/data/bss resources with their physical addresses. You can see it in the `/proc/iomem`: ```C 00100000-be825fff : System RAM @@ -477,7 +477,7 @@ static struct resource code_resource = { }; ``` -The last step which we will cover in this part will be `NX` configuration. `NX-bit` or no execute bit is 63-bit in the page directory entry which controls the ability to execute code from all physical pages mapped by the table entry. This bit can only be used/set when the `no-execute` page-protection mechanism is enabled by the setting `EFER.NXE` to 1. In the `x86_configure_nx` function we check that CPU has support of `NX-bit` and it does not disabled. After the check we fill `__supported_pte_mask` depend on it: +The last step which we will cover in this part will be `NX` configuration. `NX-bit` or no execute bit is 63-bit in the page directory entry which controls the ability to execute code from all physical pages mapped by the table entry. This bit can only be used/set when the `no-execute` page-protection mechanism is enabled by the setting `EFER.NXE` to 1. In the `x86_configure_nx` function we check that CPU has support of `NX-bit` and it does not disabled. After the check we fill `__supported_pte_mask` depend on it: ```C void x86_configure_nx(void) @@ -492,7 +492,7 @@ void x86_configure_nx(void) Conclusion -------------------------------------------------------------------------------- -It is the end of the fifth part about linux kernel initialization process. In this part we continued to dive in the `setup_arch` function which makes initialization of architecture-specific stuff. It was long part, but we have not finished with it. As i already wrote, the `setup_arch` is big function, and I am really not sure that we will cover all of it even in the next part. There were some new interesting concepts in this part like `Fix-mapped` addresses, ioremap and etc... Don't worry if they are unclear for you. There is a special part about these concepts - [Linux kernel memory management Part 2.](https://github.com/0xAX/linux-insides/blob/master/MM/linux-mm-2.md). In the next part we will continue with the initialization of the architecture-specific stuff and will see parsing of the early kernel parameters, early dump of the pci devices, `Desktop Management Interface` scanning and many many more. +It is the end of the fifth part about Linux kernel initialization process. In this part we continued to dive in the `setup_arch` function which makes initialization of architecture-specific stuff. It was long part, but we are not finished with it. As I already wrote, the `setup_arch` is big function, and I am really not sure that we will cover all of it even in the next part. There were some new interesting concepts in this part like `Fix-mapped` addresses, ioremap and etc... Don't worry if they are unclear for you. There is a special part about these concepts - [Linux kernel memory management Part 2.](https://github.com/0xAX/linux-insides/blob/master/MM/linux-mm-2.md). In the next part we will continue with the initialization of the architecture-specific stuff and will see parsing of the early kernel parameters, early dump of the pci devices, `Desktop Management Interface` scanning and many many more. If you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX). @@ -504,7 +504,7 @@ Links * [mm vs active_mm](https://www.kernel.org/doc/Documentation/vm/active_mm.txt) * [e820](http://en.wikipedia.org/wiki/E820) * [Supervisor mode access prevention](https://lwn.net/Articles/517475/) -* [Kernel stacks](https://www.kernel.org/doc/Documentation/x86/x86_64/kernel-stacks) +* [Kernel stacks](https://www.kernel.org/doc/Documentation/x86/kernel-stacks) * [TSS](http://en.wikipedia.org/wiki/Task_state_segment) * [IDT](http://en.wikipedia.org/wiki/Interrupt_descriptor_table) * [Memory mapped I/O](http://en.wikipedia.org/wiki/Memory-mapped_I/O) diff --git a/Initialization/linux-initialization-6.md b/Initialization/linux-initialization-6.md index 43fd5fa1..d15279db 100644 --- a/Initialization/linux-initialization-6.md +++ b/Initialization/linux-initialization-6.md @@ -4,7 +4,7 @@ Kernel initialization. Part 6. Architecture-specific initialization, again... ================================================================================ -In the previous [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-5) we saw architecture-specific (`x86_64` in our case) initialization stuff from the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/setup.c) and finished on `x86_configure_nx` function which sets the `_PAGE_NX` flag depends on support of [NX bit](http://en.wikipedia.org/wiki/NX_bit). As I wrote before `setup_arch` function and `start_kernel` are very big, so in this and in the next part we will continue to learn about architecture-specific initialization process. The next function after `x86_configure_nx` is `parse_early_param`. This function is defined in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) and as you can understand from its name, this function parses kernel command line and setups different services depends on the given parameters (all kernel command line parameters you can find are in the [Documentation/kernel-parameters.txt](https://github.com/torvalds/linux/blob/master/Documentation/admin-guide/kernel-parameters.rst)). You may remember how we setup `earlyprintk` in the earliest [part](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-2). On the early stage we looked for kernel parameters and their value with the `cmdline_find_option` function and `__cmdline_find_option`, `__cmdline_find_option_bool` helpers from the [arch/x86/boot/cmdline.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/cmdline.c). There we're in the generic kernel part which does not depend on architecture and here we use another approach. If you are reading linux kernel source code, you already note calls like this: +In the previous [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-5) we saw architecture-specific (`x86_64` in our case) initialization stuff from the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/setup.c) and finished on `x86_configure_nx` function which sets the `_PAGE_NX` flag depends on support of [NX bit](http://en.wikipedia.org/wiki/NX_bit). As I wrote before `setup_arch` function and `start_kernel` are very big, so in this and in the next part we will continue to learn about architecture-specific initialization process. The next function after `x86_configure_nx` is `parse_early_param`. This function is defined in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) and as you can understand from its name, this function parses kernel command line and setups different services depends on the given parameters (all kernel command line parameters you can find are in the [Documentation/kernel-parameters.txt](https://github.com/torvalds/linux/blob/master/Documentation/admin-guide/kernel-parameters.rst)). You may remember how we setup `earlyprintk` in the earliest [part](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-2). On the early stage we looked for kernel parameters and their value with the `cmdline_find_option` function and `__cmdline_find_option`, `__cmdline_find_option_bool` helpers from the [arch/x86/boot/cmdline.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/cmdline.c). There we're in the generic kernel part which does not depend on architecture and here we use another approach. If you are reading Linux kernel source code, you already note calls like this: ```C early_param("gbpages", parse_direct_gbpages_on); @@ -60,7 +60,7 @@ Note that `__set_param` macro defines with `__section(.init.setup)` attribute. I VMLINUX_SYMBOL(__setup_end) = .; ``` -Now we know how parameters are defined, let's back to the `parse_early_param` implementation: +Now we know how parameters are defined, let's back to the `parse_early_param` implementation: ```C void __init parse_early_param(void) @@ -128,7 +128,7 @@ int __init acpi_mps_check(void) } ``` -It checks the built-in `MPS` or [MultiProcessor Specification](http://en.wikipedia.org/wiki/MultiProcessor_Specification) table. If `CONFIG_X86_LOCAL_APIC` is set and `CONFIG_x86_MPPAARSE` is not set, `acpi_mps_check` prints warning message if the one of the command line options: `acpi=off`, `acpi=noirq` or `pci=noacpi` passed to the kernel. If `acpi_mps_check` returns `1` it means that we disable local [APIC](http://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller) and clear `X86_FEATURE_APIC` bit in the of the current CPU with the `setup_clear_cpu_cap` macro. (more about CPU mask you can read in the [CPU masks](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2)). +It checks the built-in `MPS` or [MultiProcessor Specification](http://en.wikipedia.org/wiki/MultiProcessor_Specification) table. If `CONFIG_X86_LOCAL_APIC` is set and `CONFIG_x86_MPPARSE` is not set, `acpi_mps_check` prints warning message if the one of the command line options: `acpi=off`, `acpi=noirq` or `pci=noacpi` passed to the kernel. If `acpi_mps_check` returns `1` it means that we disable local [APIC](http://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller) and clear `X86_FEATURE_APIC` bit in the of the current CPU with the `setup_clear_cpu_cap` macro. (more about CPU mask you can read in the [CPU masks](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2)). Early PCI dump -------------------------------------------------------------------------------- @@ -165,7 +165,7 @@ char *__init pcibios_setup(char *str) { } ``` -So, if `CONFIG_PCI` option is set and we passed `pci=earlydump` option to the kernel command line, next function which will be called - `early_dump_pci_devices` from the [arch/x86/pci/early.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/pci/early.c). This function checks `noearly` pci parameter with: +So, if `CONFIG_PCI` option is set and we passed `pci=earlydump` option to the kernel command line, next function which will be called - `early_dump_pci_devices` from the [arch/x86/pci/early.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/pci/early.c). This function checks `noearly` PCI parameter with: ```C if (!early_pci_allowed()) @@ -208,7 +208,7 @@ After the `early_dump_pci_devices`, there are a couple of function related with early_reserve_e820_mpc_new(); ``` -Let's look on it. As you can see the first function is `e820_reserve_setup_data`. This function does almost the same as `memblock_x86_reserve_range_setup_data` which we saw above, but it also calls `e820_update_range` which adds new regions to the `e820map` with the given type which is `E820_RESERVED_KERN` in our case. The next function is `finish_e820_parsing` which sanitizes `e820map` with the `sanitize_e820_map` function. Besides this two functions we can see a couple of functions related to the [e820](http://en.wikipedia.org/wiki/E820). You can see it in the listing above. `e820_add_kernel_range` function takes the physical address of the kernel start and end: +Let's look at it. As you can see the first function is `e820_reserve_setup_data`. This function does almost the same as `memblock_x86_reserve_range_setup_data` which we saw above, but it also calls `e820_update_range` which adds new regions to the `e820map` with the given type which is `E820_RESERVED_KERN` in our case. The next function is `finish_e820_parsing` which sanitizes `e820map` with the `sanitize_e820_map` function. Besides this two functions we can see a couple of functions related to the [e820](http://en.wikipedia.org/wiki/E820). You can see it in the listing above. `e820_add_kernel_range` function takes the physical address of the kernel start and end: ```C u64 start = __pa_symbol(_text); @@ -273,13 +273,13 @@ if (max_pfn > (1UL<<(32 - PAGE_SHIFT))) max_low_pfn = e820_end_of_low_ram_pfn(); else max_low_pfn = max_pfn; - + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; ``` Next we calculate `high_memory` (defines the upper bound on direct map memory) with `__va` macro which returns a virtual address by the given physical memory. -DMI scanning +DMI scanning ------------------------------------------------------------------------------- The next step after manipulations with different memory regions and `e820` slots is collecting information about computer. We will get all information with the [Desktop Management Interface](http://en.wikipedia.org/wiki/Desktop_Management_Interface) and following functions: @@ -356,7 +356,7 @@ RESERVE_BRK(dmi_alloc, 65536); #endif ``` -`RESERVE_BRK` defined in the [arch/x86/include/asm/setup.h](http://en.wikipedia.org/wiki/Desktop_Management_Interface) and reserves space with given size in the `brk` section. +`RESERVE_BRK` defined in the [arch/x86/include/asm/setup.h](http://github.com/torvalds/linux/blob/master/arch/x86/include/asm/setup.h) and reserves space with given size in the `brk` section. ------------------------- init_hypervisor_platform(); @@ -451,7 +451,7 @@ void __init early_alloc_pgt_buf(void) } ``` -First of all it get the size of the page table buffer, it will be `INIT_PGT_BUF_SIZE` which is `(6 * PAGE_SIZE)` in the current linux kernel 4.0. As we got the size of the page table buffer, we call `extend_brk` function with two parameters: size and align. As you can understand from its name, this function extends the `brk` area. As we can see in the linux kernel linker script `brk` is in memory right after the [BSS](http://en.wikipedia.org/wiki/.bss): +First of all it get the size of the page table buffer, it will be `INIT_PGT_BUF_SIZE` which is `(6 * PAGE_SIZE)` in the current Linux kernel 4.0. As we got the size of the page table buffer, we call `extend_brk` function with two parameters: size and align. As you can understand from its name, this function extends the `brk` area. As we can see in the linux kernel linker script `brk` is in memory right after the [BSS](http://en.wikipedia.org/wiki/.bss): ```C . = ALIGN(PAGE_SIZE); @@ -517,12 +517,12 @@ MEMBLOCK configuration: The rest functions after the `memblock_x86_fill` are: `early_reserve_e820_mpc_new` allocates additional slots in the `e820map` for MultiProcessor Specification table, `reserve_real_mode` - reserves low memory from `0x0` to 1 megabyte for the trampoline to the real mode (for rebooting, etc.), `trim_platform_memory_ranges` - trims certain memory regions started from `0x20050000`, `0x20110000`, etc. these regions must be excluded because [Sandy Bridge](http://en.wikipedia.org/wiki/Sandy_Bridge) has problems with these regions, `trim_low_memory_range` reserves the first 4 kilobyte page in `memblock`, `init_mem_mapping` function reconstructs direct memory mapping and setups the direct mapping of the physical memory at `PAGE_OFFSET`, `early_trap_pf_init` setups `#PF` handler (we will look on it in the chapter about interrupts) and `setup_real_mode` function setups trampoline to the [real mode](http://en.wikipedia.org/wiki/Real_mode) code. -That's all. You can note that this part will not cover all functions which are in the `setup_arch` (like `early_gart_iommu_check`, [mtrr](http://en.wikipedia.org/wiki/Memory_type_range_register) initialization, etc.). As I already wrote many times, `setup_arch` is big, and linux kernel is big. That's why I can't cover every line in the linux kernel. I don't think that we missed something important, but you can say something like: each line of code is important. Yes, it's true, but I missed them anyway, because I think that it is not realistic to cover full linux kernel. Anyway we will often return to the idea that we have already seen, and if something is unfamiliar, we will cover this theme. +That's all. You can note that this part will not cover all functions which are in the `setup_arch` (like `early_gart_iommu_check`, [mtrr](http://en.wikipedia.org/wiki/Memory_type_range_register) initialization, etc.). As I already wrote many times, `setup_arch` is big, and Linux kernel is big. That's why I can't cover every line in the linux kernel. I don't think that we missed something important, but you can say something like: each line of code is important. Yes, it's true, but I missed them anyway, because I think that it is not realistic to cover full linux kernel. Anyway we will often return to the idea that we have already seen, and if something is unfamiliar, we will cover this theme. Conclusion -------------------------------------------------------------------------------- -It is the end of the sixth part about linux kernel initialization process. In this part we continued to dive in the `setup_arch` function again and it was long part, but we are not finished with it. Yes, `setup_arch` is big, hope that next part will be the last part about this function. +It is the end of the sixth part about Linux kernel initialization process. In this part we continued to dive in the `setup_arch` function again and it was long part, but we are not finished with it. Yes, `setup_arch` is big, hope that next part will be the last part about this function. If you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX). diff --git a/Initialization/linux-initialization-7.md b/Initialization/linux-initialization-7.md index 7e02df20..8198a7c9 100644 --- a/Initialization/linux-initialization-7.md +++ b/Initialization/linux-initialization-7.md @@ -19,7 +19,7 @@ Now let's look on the implementation of the `setup_log_buf` function. It starts ```C if (log_buf != __log_buf) return; - + if (!early && !new_log_buf_len) log_buf_add_cpu(); ``` @@ -41,7 +41,7 @@ if (ramdisk_size >= (mapped_size>>1)) "disabling initrd (%lld needed, %lld available)\n", ramdisk_size, mapped_size>>1); ``` - + You can see here that we call `memblock_mem_size` function and pass the `max_pfn_mapped` to it, where `max_pfn_mapped` contains the highest direct mapped page frame number. If you do not remember what is `page frame number`, explanation is simple: First `12` bits of the virtual address represent offset in the physical page or page frame. If we right-shift out `12` bits of the virtual address, we'll discard offset part and will get `Page Frame Number`. In the `memblock_mem_size` we go through the all memblock `mem` (not reserved) regions and calculates size of the mapped pages and return it to the `mapped_size` variable (see code above). As we got amount of the direct mapped memory, we check that size of the `initrd` is not greater than mapped pages. If it is greater we just call `panic` which halts the system and prints famous [Kernel panic](http://en.wikipedia.org/wiki/Kernel_panic) message. In the next step we print information about the `initrd` size. We can see the result of this in the `dmesg` output: ```C @@ -183,13 +183,13 @@ function. The `cma_declare_contiguous` reserves contiguous area from the given b Initialization of the sparse memory -------------------------------------------------------------------------------- -The next step is the call of the function - `x86_init.paging.pagetable_init`. If you try to find this function in the linux kernel source code, in the end of your search, you will see the following macro: +The next step is the call of the function - `x86_init.paging.pagetable_init`. If you try to find this function in the Linux kernel source code, in the end of your search, you will see the following macro: ```C #define native_pagetable_init paging_init ``` -which expands as you can see to the call of the `paging_init` function from the [arch/x86/mm/init_64.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/mm/init_64.c). The `paging_init` function initializes sparse memory and zone sizes. First of all what's zones and what is it `Sparsemem`. The `Sparsemem` is a special foundation in the linux kernel memory manager which used to split memory area into different memory banks in the [NUMA](http://en.wikipedia.org/wiki/Non-uniform_memory_access) systems. Let's look on the implementation of the `paginig_init` function: +which expands as you can see to the call of the `paging_init` function from the [arch/x86/mm/init_64.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/mm/init_64.c). The `paging_init` function initializes sparse memory and zone sizes. First of all what's zones and what is it `Sparsemem`. The `Sparsemem` is a special foundation in the Linux kernel memory manager which used to split memory area into different memory banks in the [NUMA](http://en.wikipedia.org/wiki/Non-uniform_memory_access) systems. Let's look on the implementation of the `paging_init` function: ```C void __init paging_init(void) @@ -209,7 +209,7 @@ As you can see there is call of the `sparse_memory_present_with_active_regions` Again, this part and next parts do not cover this theme in full details. There will be special part about `NUMA`. -vsyscall mapping +vsyscall mapping -------------------------------------------------------------------------------- The next step after `SparseMem` initialization is setting of the `trampoline_cr4_features` which must contain content of the `cr4` [Control register](http://en.wikipedia.org/wiki/Control_register). First of all we need to check that current CPU has support of the `cr4` register and if it has, we save its content to the `trampoline_cr4_features` which is storage for `cr4` in the real mode: @@ -315,7 +315,7 @@ struct mpf_intel *mpf = mpf_found; if (!mpf) return; - + if (acpi_lapic && early) return; ``` @@ -325,7 +325,7 @@ Here we can see that multiprocessor configuration was found in the `smp_scan_con The rest of the setup_arch -------------------------------------------------------------------------------- -Here we are getting to the end of the `setup_arch` function. The rest of function of course is important, but details about these stuff will not will not be included in this part. We will just take a short look on these functions, because although they are important as I wrote above, but they cover non-generic kernel features related with the `NUMA`, `SMP`, `ACPI` and `APICs`, etc. First of all, the next call of the `init_apic_mappings` function. As we can understand this function sets the address of the local [APIC](http://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller). The next is `x86_io_apic_ops.init` and this function initializes I/O APIC. Please note that we will see all details related with `APIC` in the chapter about interrupts and exceptions handling. In the next step we reserve standard I/O resources like `DMA`, `TIMER`, `FPU`, etc., with the call of the `x86_init.resources.reserve_resources` function. Following is `mcheck_init` function initializes `Machine check Exception` and the last is `register_refined_jiffies` which registers [jiffy](http://en.wikipedia.org/wiki/Jiffy_%28time%29) (There will be separate chapter about timers in the kernel). +Here we are getting to the end of the `setup_arch` function. The rest of function of course is important, but details about these stuff will not will not be included in this part. We will just take a short look on these functions, because although they are important as I wrote above, they cover non-generic kernel features related with the `NUMA`, `SMP`, `ACPI` and `APICs`, etc. First of all, the next call of the `init_apic_mappings` function. As we can understand this function sets the address of the local [APIC](http://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller). The next is `x86_io_apic_ops.init` and this function initializes I/O APIC. Please note that we will see all details related with `APIC` in the chapter about interrupts and exceptions handling. In the next step we reserve standard I/O resources like `DMA`, `TIMER`, `FPU`, etc., with the call of the `x86_init.resources.reserve_resources` function. Following is `mcheck_init` function initializes `Machine check Exception` and the last is `register_refined_jiffies` which registers [jiffy](http://en.wikipedia.org/wiki/Jiffy_%28time%29) (There will be separate chapter about timers in the kernel). So that's all. Finally we have finished with the big `setup_arch` function in this part. Of course as I already wrote many times, we did not see full details about this function, but do not worry about it. We will be back more than once to this function from different chapters for understanding how different platform-dependent parts are initialized. @@ -360,7 +360,7 @@ This function takes pointer to the kernel command line allocates a couple of buf * `initcall_command_line` - will contain boot command line. will be used in the `do_initcall_level`; * `static_command_line` - will contain command line for parameters parsing. -We will allocate space with the `memblock_virt_alloc` function. This function calls `memblock_virt_alloc_try_nid` which allocates boot memory block with `memblock_reserve` if [slab](http://en.wikipedia.org/wiki/Slab_allocation) is not available or uses `kzalloc_node` (more about it will be in the linux memory management chapter). The `memblock_virt_alloc` uses `BOOTMEM_LOW_LIMIT` (physical address of the `(PAGE_OFFSET + 0x1000000)` value) and `BOOTMEM_ALLOC_ACCESSIBLE` (equal to the current value of the `memblock.current_limit`) as minimum address of the memory region and maximum address of the memory region. +We will allocate space with the `memblock_virt_alloc` function. This function calls `memblock_virt_alloc_try_nid` which allocates boot memory block with `memblock_reserve` if [slab](http://en.wikipedia.org/wiki/Slab_allocation) is not available or uses `kzalloc_node` (more about it will be in the Linux memory management chapter). The `memblock_virt_alloc` uses `BOOTMEM_LOW_LIMIT` (physical address of the `(PAGE_OFFSET + 0x1000000)` value) and `BOOTMEM_ALLOC_ACCESSIBLE` (equal to the current value of the `memblock.current_limit`) as minimum address of the memory region and maximum address of the memory region. Let's look on the implementation of the `setup_command_line`: @@ -458,7 +458,7 @@ That's all. Conclusion ================================================================================ -It is the end of the seventh part about the linux kernel initialization process. In this part, finally we have finished with the `setup_arch` function and returned to the `start_kernel` function. In the next part we will continue to learn generic kernel code from the `start_kernel` and will continue our way to the first `init` process. +It is the end of the seventh part about the Linux kernel initialization process. In this part, finally we have finished with the `setup_arch` function and returned to the `start_kernel` function. In the next part we will continue to learn generic kernel code from the `start_kernel` and will continue our way to the first `init` process. If you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX). diff --git a/Initialization/linux-initialization-8.md b/Initialization/linux-initialization-8.md index 9b74cf6e..1964f5df 100644 --- a/Initialization/linux-initialization-8.md +++ b/Initialization/linux-initialization-8.md @@ -75,7 +75,7 @@ static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) The `get_cpu_gdt_table` uses `per_cpu` macro for getting value of a `gdt_page` percpu variable for the given CPU number (bootstrap processor with `id` - 0 in our case). -You may ask the following question: so, if we can access `gdt_page` percpu variable, where it was defined? Actually we already saw it in this book. If you have read the first [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-1) of this chapter, you can remember that we saw definition of the `gdt_page` in the [arch/x86/kernel/head_64.S](https://github.com/0xAX/linux/blob/0a07b238e5f488b459b6113a62e06b6aab017f71/arch/x86/kernel/head_64.S): +You may ask the following question: so, if we can access `gdt_page` percpu variable, where was it defined? Actually we already saw it in this book. If you have read the first [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-1) of this chapter, you can remember that we saw definition of the `gdt_page` in the [arch/x86/kernel/head_64.S](https://github.com/0xAX/linux/blob/0a07b238e5f488b459b6113a62e06b6aab017f71/arch/x86/kernel/head_64.S): ```assembly early_gdt_descr: @@ -117,29 +117,29 @@ void load_percpu_segment(int cpu) { } ``` -The base address of the `percpu` area must contain `gs` register (or `fs` register for `x86`), so we are using `loadsegment` macro and pass `gs`. In the next step we writes the base address if the [IRQ](http://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) stack and setup stack [canary](http://en.wikipedia.org/wiki/Buffer_overflow_protection) (this is only for `x86_32`). After we load new `GDT`, we fill `cpu_callout_mask` bitmap with the current cpu and set cpu state as online with the setting `cpu_state` percpu variable for the current processor - `CPU_ONLINE`: +The base address of the `percpu` area must contain `gs` register (or `fs` register for `x86`), so we are using `loadsegment` macro and pass `gs`. In the next step we write the base address if the [IRQ](http://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) stack and setup stack [canary](http://en.wikipedia.org/wiki/Buffer_overflow_protection) (this is only for `x86_32`). After we load new `GDT`, we fill `cpu_callout_mask` bitmap with the current cpu and set cpu state as online with the setting `cpu_state` percpu variable for the current processor - `CPU_ONLINE`: ```C cpumask_set_cpu(me, cpu_callout_mask); per_cpu(cpu_state, me) = CPU_ONLINE; ``` -So, what is `cpu_callout_mask` bitmap... As we initialized bootstrap processor (processor which is booted the first on `x86`) the other processors in a multiprocessor system are known as `secondary processors`. Linux kernel uses following two bitmasks: +So, what is `cpu_callout_mask` bitmap? As we initialized bootstrap processor (processor which is booted the first on `x86`) the other processors in a multiprocessor system are known as `secondary processors`. Linux kernel uses following two bitmasks: * `cpu_callout_mask` * `cpu_callin_mask` -After bootstrap processor initialized, it updates the `cpu_callout_mask` to indicate which secondary processor can be initialized next. All other or secondary processors can do some initialization stuff before and check the `cpu_callout_mask` on the bootstrap processor bit. Only after the bootstrap processor filled the `cpu_callout_mask` with this secondary processor, it will continue the rest of its initialization. After that the certain processor finish its initialization process, the processor sets bit in the `cpu_callin_mask`. Once the bootstrap processor finds the bit in the `cpu_callin_mask` for the current secondary processor, this processor repeats the same procedure for initialization of one of the remaining secondary processors. In a short words it works as i described, but we will see more details in the chapter about `SMP`. - +After bootstrap processor initialized, it updates the `cpu_callout_mask` to indicate which secondary processor can be initialized next. All other or secondary processors can do some initialization stuff before and check the `cpu_callout_mask` on the bootstrap processor bit. Only after the bootstrap processor filled the `cpu_callout_mask` with this secondary processor, it will continue the rest of its initialization. After that the certain processor finish its initialization process, the processor sets bit in the `cpu_callin_mask`. Once the bootstrap processor finds the bit in the `cpu_callin_mask` for the current secondary processor, this processor repeats the same procedure for initialization of one of the remaining secondary processors. In a short words it works as I described, but we will see more details in the chapter about `SMP`. + That's all. We did all `SMP` boot preparation. Build zonelists ----------------------------------------------------------------------- -In the next step we can see the call of the `build_all_zonelists` function. This function sets up the order of zones that allocations are preferred from. What are zones and what's order we will understand soon. For the start let's see how linux kernel considers physical memory. Physical memory is split into banks which are called - `nodes`. If you has no hardware support for `NUMA`, you will see only one node: +In the next step we can see the call of the `build_all_zonelists` function. This function sets up the order of zones that allocations are preferred from. What are zones and what's order we will understand soon. For the start let's see how Linux kernel considers physical memory. Physical memory is split into banks which are called - `nodes`. If you have no hardware support for `NUMA`, you will see only one node: ``` -$ cat /sys/devices/system/node/node0/numastat +$ cat /sys/devices/system/node/node0/numastat numa_hit 72452442 numa_miss 0 numa_foreign 0 @@ -148,7 +148,7 @@ local_node 72452442 other_node 0 ``` -Every `node` is presented by the `struct pglist_data` in the linux kernel. Each node is divided into a number of special blocks which are called - `zones`. Every zone is presented by the `zone struct` in the linux kernel and has one of the type: +Every `node` is presented by the `struct pglist_data` in the Linux kernel. Each node is divided into a number of special blocks which are called - `zones`. Every zone is presented by the `zone struct` in the linux kernel and has one of the type: * `ZONE_DMA` - 0-16M; * `ZONE_DMA32` - used for 32 bit devices that can only do DMA areas below 4G; @@ -185,13 +185,13 @@ As I wrote above all nodes are described with the `pglist_data` or `pg_data_t` s The rest of the stuff before scheduler initialization -------------------------------------------------------------------------------- -Before we will start to dive into linux kernel scheduler initialization process we must do a couple of things. The first thing is the `page_alloc_init` function from the [mm/page_alloc.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/mm/page_alloc.c). This function looks pretty easy: +Before we start to dive into Linux kernel scheduler initialization process we must do a couple of things. The first thing is the `page_alloc_init` function from the [mm/page_alloc.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/mm/page_alloc.c). This function looks pretty easy: ```C void __init page_alloc_init(void) { int ret; - + ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD, "mm/page_alloc:dead", NULL, page_alloc_cpu_dead); @@ -205,11 +205,11 @@ After this function we can see the kernel command line in the initialization out ![kernel command line](images/kernel_command_line.png) -And a couple of functions such as `parse_early_param` and `parse_args` which handles linux kernel command line. You may remember that we already saw the call of the `parse_early_param` function in the sixth [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-6) of the kernel initialization chapter, so why we call it again? Answer is simple: we call this function in the architecture-specific code (`x86_64` in our case), but not all architecture calls this function. And we need to call the second function `parse_args` to parse and handle non-early command line arguments. +And a couple of functions such as `parse_early_param` and `parse_args` which handles Linux kernel command line. You may remember that we already saw the call of the `parse_early_param` function in the sixth [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-6) of the kernel initialization chapter, so why we call it again? Answer is simple: we call this function in the architecture-specific code (`x86_64` in our case), but not all architecture calls this function. And we need to call the second function `parse_args` to parse and handle non-early command line arguments. In the next step we can see the call of the `jump_label_init` from the [kernel/jump_label.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/jump_label.c). and initializes [jump label](https://lwn.net/Articles/412072/). -After this we can see the call of the `setup_log_buf` function which setups the [printk](http://www.makelinux.net/books/lkd2/ch18lev1sec3) log buffer. We already saw this function in the seventh [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-7) of the linux kernel initialization process chapter. +After this we can see the call of the `setup_log_buf` function which setups the [printk](http://www.makelinux.net/books/lkd2/ch18lev1sec3) log buffer. We already saw this function in the seventh [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-7) of the Linux kernel initialization process chapter. PID hash initialization -------------------------------------------------------------------------------- @@ -230,7 +230,7 @@ pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, ``` The number of elements of the `pid_hash` depends on the `RAM` configuration, but it can be between `2^4` and `2^12`. The `pidhash_init` computes the size -and allocates the required storage (which is `hlist` in our case - the same as [doubly linked list](https://0xax.gitbook.io/linux-insides/summary/datastructures/linux-datastructures-1), but contains one pointer instead on the [struct hlist_head](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/types.h)]. The `alloc_large_system_hash` function allocates a large system hash table with `memblock_virt_alloc_nopanic` if we pass `HASH_EARLY` flag (as it in our case) or with `__vmalloc` if we did no pass this flag. +and allocates the required storage (which is `hlist` in our case - the same as [doubly linked list](https://0xax.gitbook.io/linux-insides/summary/datastructures/linux-datastructures-1), but contains one pointer instead on the [struct hlist_head](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/types.h). The `alloc_large_system_hash` function allocates a large system hash table with `memblock_virt_alloc_nopanic` if we pass `HASH_EARLY` flag (as it in our case) or with `__vmalloc` if we did no pass this flag. The result we can see in the `dmesg` output: @@ -244,7 +244,7 @@ $ dmesg | grep hash That's all. The rest of the stuff before scheduler initialization is the following functions: `vfs_caches_init_early` does early initialization of the [virtual file system](http://en.wikipedia.org/wiki/Virtual_file_system) (more about it will be in the chapter which will describe virtual file system), `sort_main_extable` sorts the kernel's built-in exception table entries which are between `__start___ex_table` and `__stop___ex_table`, and `trap_init` initializes trap handlers (more about last two function we will know in the separate chapter about interrupts). -The last step before the scheduler initialization is initialization of the memory manager with the `mm_init` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c). As we can see, the `mm_init` function initializes different parts of the linux kernel memory manager: +The last step before the scheduler initialization is initialization of the memory manager with the `mm_init` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c). As we can see, the `mm_init` function initializes different parts of the Linux kernel memory manager: ```C page_ext_init_flatmem(); @@ -255,7 +255,7 @@ pgtable_init(); vmalloc_init(); ``` -The first is `page_ext_init_flatmem` which depends on the `CONFIG_SPARSEMEM` kernel configuration option and initializes extended data per page handling. The `mem_init` releases all `bootmem`, the `kmem_cache_init` initializes kernel cache, the `percpu_init_late` - replaces `percpu` chunks with those allocated by [slub](http://en.wikipedia.org/wiki/SLUB_%28software%29), the `pgtable_init` - initializes the `page->ptl` kernel cache, the `vmalloc_init` - initializes `vmalloc`. Please, **NOTE** that we will not dive into details about all of these functions and concepts, but we will see all of they it in the [Linux kernel memory manager](https://0xax.gitbook.io/linux-insides/summary/mm) chapter. +The first is `page_ext_init_flatmem` which depends on the `CONFIG_SPARSEMEM` kernel configuration option and initializes extended data per page handling. The `mem_init` releases all `bootmem`, the `kmem_cache_init` initializes kernel cache, the `percpu_init_late` - replaces `percpu` chunks with those allocated by [slub](http://en.wikipedia.org/wiki/SLUB_%28software%29), the `pgtable_init` - initializes the `page->ptl` kernel cache, the `vmalloc_init` - initializes `vmalloc`. Please, **NOTE** that we will not dive into details about all of these functions and concepts, but we will see all of them it in the [Linux kernel memory manager](https://0xax.gitbook.io/linux-insides/summary/mm) chapter. That's all. Now we can look on the `scheduler`. @@ -318,7 +318,7 @@ The `Completely Fair Scheduler` supports following `normal` or in other words `n The `SCHED_NORMAL` is used for the most normal applications, the amount of cpu each process consumes is mostly determined by the [nice](http://en.wikipedia.org/wiki/Nice_%28Unix%29) value, the `SCHED_BATCH` used for the 100% non-interactive tasks and the `SCHED_IDLE` runs tasks only when the processor has no task to run besides this task. -The `real-time` policies are also supported for the time-critical applications: `SCHED_FIFO` and `SCHED_RR`. If you've read something about the Linux kernel scheduler, you can know that it is modular. That means it supports different algorithms to schedule different types of processes. Usually this modularity is called `scheduler classes`. These modules encapsulate scheduling policy details and are handled by the scheduler core without knowing too much about them. +The `real-time` policies are also supported for the time-critical applications: `SCHED_FIFO` and `SCHED_RR`. If you've read something about the Linux kernel scheduler, you can know that it is modular. That means it supports different algorithms to schedule different types of processes. Usually this modularity is called `scheduler classes`. These modules encapsulate scheduling policy details and are handled by the scheduler core without knowing too much about them. Now let's get back to the our code and look on the two configuration options: `CONFIG_FAIR_GROUP_SCHED` and `CONFIG_RT_GROUP_SCHED`. The smallest unit that the scheduler works with is an individual task or thread. However, a process is not the only type of entity that the scheduler can operate with. Both of these options provide support for group scheduling. The first option provides support for group scheduling with the `completely fair scheduler` policies and the second with the `real-time` policies respectively. @@ -340,11 +340,11 @@ The first is for case when scheduling of task groups is enabled with `completely * scheduler entity structure; * `runqueue`. -After we have calculated size, we allocate a space with the `kzalloc` function and set pointers of `sched_entity` and `runquques` there: +After we have calculated size, we allocate a space with the `kzalloc` function and set pointers of `sched_entity` and `runqueues` there: ```C ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); - + #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.se = (struct sched_entity **)ptr; ptr += nr_cpu_ids * sizeof(void **); @@ -396,10 +396,10 @@ All groups have to be able to rely on the amount of CPU time. The two following The first represents a period and the second represents quantum that is allocated for `real-time` tasks during `sched_rt_period_us`. You may see global values of these parameters in the: ``` -$ cat /proc/sys/kernel/sched_rt_period_us +$ cat /proc/sys/kernel/sched_rt_period_us 1000000 -$ cat /proc/sys/kernel/sched_rt_runtime_us +$ cat /proc/sys/kernel/sched_rt_runtime_us 950000 ``` @@ -415,7 +415,7 @@ That's all with the bandwiths of `real-time` and `deadline` tasks and in the nex The real-time scheduler requires global resources to make scheduling decision. But unfortunately scalability bottlenecks appear as the number of CPUs increase. The concept of `root domains` was introduced for improving scalability and avoid such bottlenecks. Instead of bypassing over all `run queues`, the scheduler gets information about a CPU where/from to push/pull a `real-time` task from the `root_domain` structure. This structure is defined in the [kernel/sched/sched.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/sched/sched.h) kernel header file and just keeps track of CPUs that can be used to push or pull a process. -After `root domain` initialization, we make initialization of the `bandwidth` for the `real-time` tasks of the `root task group` as we did the same above: +After `root domain` initialization, we make initialization of the `bandwidth` for the `real-time` tasks of the `root task group` as we did the same above: ```C #ifdef CONFIG_RT_GROUP_SCHED init_rt_bandwidth(&root_task_group.rt_bandwidth, @@ -499,7 +499,7 @@ struct task_struct { } ``` -The first one is `dynamic priority` which can't be changed during lifetime of a process based on its static priority and interactivity of the process. The `static_prio` contains initial priority most likely well-known to you `nice value`. This value does not changed by the kernel if a user will not change it. The last one is `normal_priority` based on the value of the `static_prio` too, but also it depends on the scheduling policy of a process. +The first one is `dynamic priority` which can't be changed during lifetime of a process based on its static priority and interactivity of the process. The `static_prio` contains initial priority most likely well-known to you `nice value`. This value is not changed by the kernel if a user does not change it. The last one is `normal_priority` based on the value of the `static_prio` too, but also it depends on the scheduling policy of a process. So the main goal of the `set_load_weight` function is to initialize `load_weight` fields for the `init` task: @@ -541,12 +541,12 @@ The last two steps of the `sched_init` function is to initialization of schedule scheduler_running = 1; ``` -That's all. Linux kernel scheduler is initialized. Of course, we have skipped many different details and explanations here, because we need to know and understand how different concepts (like process and process groups, runqueue, rcu, etc.) works in the linux kernel , but we took a short look on the scheduler initialization process. We will look all other details in the separate part which will be fully dedicated to the scheduler. +That's all. Linux kernel scheduler is initialized. Of course, we have skipped many different details and explanations here, because we need to know and understand how different concepts (like process and process groups, runqueue, rcu, etc.) works in the Linux kernel , but we took a short look on the scheduler initialization process. We will look all other details in the separate part which will be fully dedicated to the scheduler. Conclusion -------------------------------------------------------------------------------- -It is the end of the eighth part about the linux kernel initialization process. In this part, we looked on the initialization process of the scheduler and we will continue in the next part to dive in the linux kernel initialization process and will see initialization of the [RCU](http://en.wikipedia.org/wiki/Read-copy-update) and many other initialization stuff in the next part. +It is the end of the eighth part about the Linux kernel initialization process. In this part, we looked on the initialization process of the scheduler and we will continue in the next part to dive in the linux kernel initialization process and will see initialization of the [RCU](http://en.wikipedia.org/wiki/Read-copy-update) and many other initialization stuff in the next part. If you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX). diff --git a/Initialization/linux-initialization-9.md b/Initialization/linux-initialization-9.md index 0684a57a..3ee00077 100644 --- a/Initialization/linux-initialization-9.md +++ b/Initialization/linux-initialization-9.md @@ -4,7 +4,7 @@ Kernel initialization. Part 9. RCU initialization ================================================================================ -This is ninth part of the [Linux Kernel initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization) and in the previous part we stopped at the [scheduler initialization](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-8). In this part we will continue to dive to the linux kernel initialization process and the main purpose of this part will be to learn about initialization of the [RCU](http://en.wikipedia.org/wiki/Read-copy-update). We can see that the next step in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) after the `sched_init` is the call of the `preempt_disable`. There are two macros: +This is ninth part of the [Linux Kernel initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization) and in the previous part we stopped at the [scheduler initialization](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-8). In this part we will continue to dive to the Linux kernel initialization process and the main purpose of this part will be to learn about initialization of the [RCU](http://en.wikipedia.org/wiki/Read-copy-update). We can see that the next step in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) after the `sched_init` is the call of the `preempt_disable`. There are two macros: * `preempt_disable` * `preempt_enable` @@ -71,7 +71,7 @@ That's all. Preemption is disabled and we can go ahead. Initialization of the integer ID management -------------------------------------------------------------------------------- -In the next step we can see the call of the `idr_init_cache` function which defined in the [lib/idr.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/lib/idr.c). The `idr` library is used in a various [places](http://lxr.free-electrons.com/ident?i=idr_find) in the linux kernel to manage assigning integer `IDs` to objects and looking up objects by id. +In the next step we can see the call of the `idr_init_cache` function which defined in the [lib/idr.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/lib/idr.c). The `idr` library is used in a various [places](http://lxr.free-electrons.com/ident?i=idr_find) in the Linux kernel to manage assigning integer `IDs` to objects and looking up objects by id. Let's look on the implementation of the `idr_init_cache` function: @@ -120,14 +120,14 @@ More about integer ID management you can read [here](https://lwn.net/Articles/10 RCU initialization -------------------------------------------------------------------------------- -The next step is [RCU](http://en.wikipedia.org/wiki/Read-copy-update) initialization with the `rcu_init` function and it's implementation depends on two kernel configuration options: +The next step is [RCU](http://en.wikipedia.org/wiki/Read-copy-update) initialization with the `rcu_init` function and its implementation depends on two kernel configuration options: * `CONFIG_TINY_RCU` * `CONFIG_TREE_RCU` In the first case `rcu_init` will be in the [kernel/rcu/tiny.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/rcu/tiny.c) and in the second case it will be defined in the [kernel/rcu/tree.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/rcu/tree.c). We will see the implementation of the `tree rcu`, but first of all about the `RCU` in general. -`RCU` or read-copy update is a scalable high-performance synchronization mechanism implemented in the Linux kernel. On the early stage the linux kernel provided support and environment for the concurrently running applications, but all execution was serialized in the kernel using a single global lock. In our days linux kernel has no single global lock, but provides different mechanisms including [lock-free data structures](http://en.wikipedia.org/wiki/Concurrent_data_structure), [percpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) data structures and other. One of these mechanisms is - the `read-copy update`. The `RCU` technique is designed for rarely-modified data structures. The idea of the `RCU` is simple. For example we have a rarely-modified data structure. If somebody wants to change this data structure, we make a copy of this data structure and make all changes in the copy. In the same time all other users of the data structure use old version of it. Next, we need to choose safe moment when original version of the data structure will have no users and update it with the modified copy. +`RCU` or read-copy update is a scalable high-performance synchronization mechanism implemented in the Linux kernel. On the early stage the Linux kernel provided support and environment for the concurrently running applications, but all execution was serialized in the kernel using a single global lock. In our days linux kernel has no single global lock, but provides different mechanisms including [lock-free data structures](http://en.wikipedia.org/wiki/Concurrent_data_structure), [percpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) data structures and other. One of these mechanisms is - the `read-copy update`. The `RCU` technique is designed for rarely-modified data structures. The idea of the `RCU` is simple. For example we have a rarely-modified data structure. If somebody wants to change this data structure, we make a copy of this data structure and make all changes in the copy. In the same time all other users of the data structure use old version of it. Next, we need to choose safe moment when original version of the data structure will have no users and update it with the modified copy. Of course this description of the `RCU` is very simplified. To understand some details about `RCU`, first of all we need to learn some terminology. Data readers in the `RCU` executed in the [critical section](http://en.wikipedia.org/wiki/Critical_section). Every time when data reader get to the critical section, it calls the `rcu_read_lock`, and `rcu_read_unlock` on exit from the critical section. If the thread is not in the critical section, it will be in state which called - `quiescent state`. The moment when every thread is in the `quiescent state` called - `grace period`. If a thread wants to remove an element from the data structure, this occurs in two steps. First step is `removal` - atomically removes element from the data structure, but does not release the physical memory. After this thread-writer announces and waits until it is finished. From this moment, the removed element is available to the thread-readers. After the `grace period` finished, the second step of the element removal will be started, it just removes the element from the physical memory. @@ -292,7 +292,7 @@ extern struct rcu_state rcu_bh_state; DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); ``` -About this states you can read [here](http://lwn.net/Articles/264090/). As I wrote above we need to initialize `rcu_state` structures and `rcu_init_one` function will help us with it. After the `rcu_state` initialization, we can see the call of the ` __rcu_init_preempt` which depends on the `CONFIG_PREEMPT_RCU` kernel configuration option. It does the same as previous functions - initialization of the `rcu_preempt_state` structure with the `rcu_init_one` function which has `rcu_state` type. After this, in the `rcu_init`, we can see the call of the: +About these states you can read [here](http://lwn.net/Articles/264090/). As I wrote above we need to initialize `rcu_state` structures and `rcu_init_one` function will help us with it. After the `rcu_state` initialization, we can see the call of the ` __rcu_init_preempt` which depends on the `CONFIG_PREEMPT_RCU` kernel configuration option. It does the same as previous functions - initialization of the `rcu_preempt_state` structure with the `rcu_init_one` function which has `rcu_state` type. After this, in the `rcu_init`, we can see the call of the: ```C open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); @@ -370,13 +370,13 @@ That's all. We saw initialization process of the `RCU` subsystem. As I wrote abo Rest of the initialization process -------------------------------------------------------------------------------- -Ok, we already passed the main theme of this part which is `RCU` initialization, but it is not the end of the linux kernel initialization process. In the last paragraph of this theme we will see a couple of functions which work in the initialization time, but we will not dive into deep details around this function for different reasons. Some reasons not to dive into details are following: +Ok, we already passed the main theme of this part which is `RCU` initialization, but it is not the end of the Linux kernel initialization process. In the last paragraph of this theme we will see a couple of functions which work in the initialization time, but we will not dive into deep details around this function for different reasons. Some reasons not to dive into details are following: * They are not very important for the generic kernel initialization process and depend on the different kernel configuration; * They have the character of debugging and not important for now; * We will see many of this stuff in the separate parts/chapters. -After we initialized `RCU`, the next step which you can see in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) is the - `trace_init` function. As you can understand from its name, this function initialize [tracing](http://en.wikipedia.org/wiki/Tracing_%28software%29) subsystem. You can read more about linux kernel trace system - [here](http://elinux.org/Kernel_Trace_Systems). +After we initialized `RCU`, the next step which you can see in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) is the - `trace_init` function. As you can understand from its name, this function initialize [tracing](http://en.wikipedia.org/wiki/Tracing_%28software%29) subsystem. You can read more about Linux kernel trace system - [here](http://elinux.org/Kernel_Trace_Systems). After the `trace_init`, we can see the call of the `radix_tree_init`. If you are familiar with the different data structures, you can understand from the name of this function that it initializes kernel implementation of the [Radix tree](http://en.wikipedia.org/wiki/Radix_tree). This function is defined in the [lib/radix-tree.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/lib/radix-tree.c) and you can read more about it in the part about [Radix tree](https://0xax.gitbook.io/linux-insides/summary/datastructures/linux-datastructures-2). @@ -405,7 +405,7 @@ This is the end of the ninth part of the [linux kernel initialization process](h Conclusion -------------------------------------------------------------------------------- -It is the end of the ninth part about the linux kernel [initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization). In this part, we looked on the initialization process of the `RCU` subsystem. In the next part we will continue to dive into linux kernel initialization process and I hope that we will finish with the `start_kernel` function and will go to the `rest_init` function from the same [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) source code file and will see the start of the first process. +It is the end of the ninth part about the Linux kernel [initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization). In this part, we looked on the initialization process of the `RCU` subsystem. In the next part we will continue to dive into linux kernel initialization process and I hope that we will finish with the `start_kernel` function and will go to the `rest_init` function from the same [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) source code file and will see the start of the first process. If you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX). diff --git a/Interrupts/README.md b/Interrupts/README.md index 9710c984..e23b566e 100644 --- a/Interrupts/README.md +++ b/Interrupts/README.md @@ -1,6 +1,6 @@ # Interrupts and Interrupt Handling -In the following posts, we will cover interrupts and exceptions handling in the linux kernel. +In the following posts, we will cover interrupts and exceptions handling in the Linux kernel. * [Interrupts and Interrupt Handling. Part 1.](linux-interrupts-1.md) - describes interrupts and interrupt handling theory. * [Interrupts in the Linux Kernel](linux-interrupts-2.md) - describes stuffs related to interrupts and exceptions handling from the early stage. diff --git a/Interrupts/images/kernel.png b/Interrupts/images/kernel.png index 93a05238..f5475cc4 100644 Binary files a/Interrupts/images/kernel.png and b/Interrupts/images/kernel.png differ diff --git a/Interrupts/linux-interrupts-1.md b/Interrupts/linux-interrupts-1.md index 0c1c8c22..cb37b612 100644 --- a/Interrupts/linux-interrupts-1.md +++ b/Interrupts/linux-interrupts-1.md @@ -281,8 +281,7 @@ The `PAGE_SIZE` is `4096`-bytes and the `THREAD_SIZE_ORDER` depends on the `KASA #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) ``` -Or `16384` bytes. The per-cpu interrupt stack is represented by the `irq_stack` struct and the `fixed_percpu_data` struct -in the Linux kernel for `x86_64`: +Or `16384` bytes. The per-cpu interrupt stack is represented by the `irq_stack` struct and the `fixed_percpu_data` struct in the Linux kernel for `x86_64`: ```C /* Per CPU interrupt stacks */ @@ -306,7 +305,7 @@ struct fixed_percpu_data { #endif ``` -The `irq_stack` struct contains a 16 kilobytes array. +The `irq_stack` struct contains a 16 kilobytes array. Also, you can see that the fixed\_percpu\_data contains two fields: * `gs_base` - The `gs` register always points to the bottom of the `fixed_percpu_data`. On the `x86_64`, the `gs` register is shared by per-cpu area and stack canary (more about `per-cpu` variables you can read in the special [part](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1)). All per-cpu symbols are zero-based and the `gs` points to the base of the per-cpu area. You already know that [segmented memory model](http://en.wikipedia.org/wiki/Memory_segmentation) is abolished in the long mode, but we can set the base address for the two segment registers - `fs` and `gs` with the [Model specific registers](http://en.wikipedia.org/wiki/Model-specific_register) and these registers can be still be used as address registers. If you remember the first [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-1) of the Linux kernel initialization process, you can remember that we have set the `gs` register: @@ -373,11 +372,11 @@ int irq_init_percpu_irqstack(unsigned int cpu) Here we go over all the CPUs one-by-one and setup the `hardirq_stack_ptr`. Where `map_irq_stack` is called to initialize the `hardirq_stack_ptr`, -to point onto the `irq_backing_store` of the current CPU with an offset of IRQ\_STACK\_SIZE, +to point onto the `irq_stack_backing_store` of the current CPU with an offset of IRQ\_STACK\_SIZE, either with guard pages or without when KASan is enabled. -After the initialization of the interrupt stack, we need to initialize the gs register within [arch/x86/kernel/cpu/common.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/cpu/common.c): +After the initialization of the interrupt stack, we need to initialize the gs register within [arch/x86/kernel/cpu/common.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/cpu/common.c): ```C void load_percpu_segment(int cpu) @@ -406,7 +405,7 @@ and as we already know the `gs` register points to the bottom of the interrupt s Here we can see the `wrmsr` instruction, which loads the data from `edx:eax` into the [Model specific register](http://en.wikipedia.org/wiki/Model-specific_register) pointed by the `ecx` register. In our case the model specific register is `MSR_GS_BASE`, which contains the base address of the memory segment pointed to by the `gs` register. `edx:eax` points to the address of the `initial_gs,` which is the base address of our `fixed_percpu_data`. -We already know that `x86_64` has a feature called `Interrupt Stack Table` or `IST` and this feature provides the ability to switch to a new stack for events like a non-maskable interrupt, double fault etc. There can be up to seven `IST` entries per-cpu. Some of them are: +We already know that `x86_64` has a feature called `Interrupt Stack Table` or `IST` and this feature provides the ability to switch to a new stack for events like a non-maskable interrupt, double fault, etc. There can be up to seven `IST` entries per-cpu. Some of them are: * `DOUBLEFAULT_STACK` * `NMI_STACK` @@ -433,7 +432,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_DF, double_fault), ``` -where `nmi` and `double_fault` are entry points created at [arch/x86/kernel/entry_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/entry/entry_64.S): +where `nmi` and `double_fault` are entry points created at [arch/x86/kernel/entry_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/entry/entry_64.S): ```assembly idtentry double_fault do_double_fault has_error_code=1 paranoid=2 read_cr2=1 diff --git a/Interrupts/linux-interrupts-10.md b/Interrupts/linux-interrupts-10.md index 1641f56c..4c55d295 100644 --- a/Interrupts/linux-interrupts-10.md +++ b/Interrupts/linux-interrupts-10.md @@ -6,7 +6,7 @@ Last part This is the tenth part of the [chapter](https://0xax.gitbook.io/linux-insides/summary/interrupts) about interrupts and interrupt handling in the Linux kernel and in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-9) we saw a little about deferred interrupts and related concepts like `softirq`, `tasklet` and `workqeue`. In this part we will continue to dive into this theme and now it's time to look at real hardware driver. -Let's consider serial driver of the [StrongARM** SA-110/21285 Evaluation Board](http://netwinder.osuosl.org/pub/netwinder/docs/intel/datashts/27813501.pdf) board for example and will look how this driver requests an [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) line, +Let's consider serial driver of the [StrongARM** SA-110/21285 Evaluation Board](http://netwinder.osuosl.org/pub/netwinder/docs/intel/datashts/27813501.pdf) board for example and will look how this driver requests an [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) line, what happens when an interrupt is triggered and etc. The source code of this driver is placed in the [drivers/tty/serial/21285.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/drivers/tty/serial/21285.c) source code file. Ok, we have source code, let's start. Initialization of a kernel module @@ -111,7 +111,7 @@ if (ret == 0) return ret; ``` -That's all. Our driver is initialized. When an `uart` port will be opened with the call of the `uart_open` function from the [drivers/tty/serial/serial_core.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/drivers/tty/serial/serial_core.c), it will call the `uart_startup` function to start up the serial port. This function will call the `startup` function that is part of the `uart_ops` structure. Each `uart` driver has the definition of this structure, in our case it is: +That's all. Our driver is initialized. When an `uart` port is opened with the call of the `uart_open` function from the [drivers/tty/serial/serial_core.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/drivers/tty/serial/serial_core.c), it will call the `uart_startup` function to start up the serial port. This function will call the `startup` function that is part of the `uart_ops` structure. Each `uart` driver has the definition of this structure, in our case it is: ```C static struct uart_ops serial21285_ops = { @@ -243,7 +243,7 @@ if (!irq_settings_can_request(desc) || WARN_ON(irq_settings_is_per_cpu_devid(des return -EINVAL; ``` -and exit with the `-EINVAL`otherways. After this we check the given interrupt handler. If it was not passed to the `request_irq` function, we check the `thread_fn`. If both handlers are `NULL`, we return with the `-EINVAL`. If an interrupt handler was not passed to the `request_irq` function, but the `thread_fn` is not null, we set handler to the `irq_default_primary_handler`: +and exit with the `-EINVAL` otherwise. After this we check the given interrupt handler. If it was not passed to the `request_irq` function, we check the `thread_fn`. If both handlers are `NULL`, we return with the `-EINVAL`. If an interrupt handler was not passed to the `request_irq` function, but the `thread_fn` is not null, we set handler to the `irq_default_primary_handler`: ```C if (!handler) { @@ -296,12 +296,12 @@ if (new->thread_fn && !nested) { } ``` -And fill the rest of the given interrupt descriptor fields in the end. So, our `16` and `17` interrupt request lines are registered and the `serial21285_rx_chars` and `serial21285_tx_chars` functions will be invoked when an interrupt controller will get event related to these interrupts. Now let's look at what happens when an interrupt occurs. +And fill the rest of the given interrupt descriptor fields in the end. So, our `16` and `17` interrupt request lines are registered and the `serial21285_rx_chars` and `serial21285_tx_chars` functions will be invoked when an interrupt controller will get event related to these interrupts. Now let's look at what happens when an interrupt occurs. Prepare to handle an interrupt -------------------------------------------------------------------------------- -In the previous paragraph we saw the requesting of the irq line for the given interrupt descriptor and registration of the `irqaction` structure for the given interrupt. We already know that when an interrupt event occurs, an interrupt controller notifies the processor about this event and processor tries to find appropriate interrupt gate for this interrupt. If you have read the eighth [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-8) of this chapter, you may remember the `native_init_IRQ` function. This function makes initialization of the local [APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller). The following part of this function is the most interesting part for us right now: +In the previous paragraph we saw the requesting of the irq line for the given interrupt descriptor and registration of the `irqaction` structure for the given interrupt. We already know that when an interrupt event occurs, an interrupt controller notifies the processor about this event and processor tries to find appropriate interrupt gate for this interrupt. If you have read the eighth [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-8) of this chapter, you may remember the `native_init_IRQ` function. This function makes initialization of the local [APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller). The following part of this function is the most interesting part for us right now: ```C for_each_clear_bit_from(i, used_vectors, first_system_vector) { @@ -398,7 +398,7 @@ static inline void generic_handle_irq_desc(unsigned int irq, struct irq_desc *de } ``` -But stop... What is it `handle_irq` and why do we call our interrupt handler from the interrupt descriptor when we know that `irqaction` points to the actual interrupt handler? Actually the `irq_desc->handle_irq` is a high-level API for the calling interrupt handler routine. It setups during initialization of the [device tree](https://en.wikipedia.org/wiki/Device_tree) and [APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller) initialization. The kernel selects correct function and call chain of the `irq->action(s)` there. In this way, the `serial21285_tx_chars` or the `serial21285_rx_chars` function will be executed after an interrupt will occur. +But stop... What is it `handle_irq` and why do we call our interrupt handler from the interrupt descriptor when we know that `irqaction` points to the actual interrupt handler? Actually the `irq_desc->handle_irq` is a high-level API for the calling interrupt handler routine. It is setup during initialization of the [device tree](https://en.wikipedia.org/wiki/Device_tree) and [APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller) initialization. The kernel selects correct function and call chain of the `irq->action(s)` there. In this way, the `serial21285_tx_chars` or the `serial21285_rx_chars` function will be executed after an interrupt occurs. In the end of the `do_IRQ` function we call the `irq_exit` function that will exit from the interrupt context, the `set_irq_regs` with the old userspace registers and return: @@ -413,7 +413,7 @@ We already know that when an `IRQ` finishes its work, deferred interrupts will b Exit from interrupt -------------------------------------------------------------------------------- -Ok, the interrupt handler finished its execution and now we must return from the interrupt. When the work of the `do_IRQ` function will be finished, we will return back to the assembler code in the [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/entry/entry_64.S) to the `ret_from_intr` label. First of all we disable interrupts with the `DISABLE_INTERRUPTS` macro that expands to the `cli` instruction and decreases value of the `irq_count` [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) variable. Remember, this variable had value - `1`, when we were in interrupt context: +Ok, the interrupt handler finished its execution and now we must return from the interrupt. When the work of the `do_IRQ` function is finished, we will return back to the assembler code in the [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/entry/entry_64.S) to the `ret_from_intr` label. First of all we disable interrupts with the `DISABLE_INTERRUPTS` macro that expands to the `cli` instruction and decreases value of the `irq_count` [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) variable. Remember, this variable had value - `1`, when we were in interrupt context: ```assembly DISABLE_INTERRUPTS(CLBR_NONE) @@ -462,8 +462,8 @@ Links * [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) * [module](https://en.wikipedia.org/wiki/Loadable_kernel_module) * [initcall](http://kernelnewbies.org/Documents/InitcallMechanism) -* [uart](https://en.wikipedia.org/wiki/Universal_asynchronous_receiver/transmitter) -* [ISA](https://en.wikipedia.org/wiki/Industry_Standard_Architecture) +* [uart](https://en.wikipedia.org/wiki/Universal_asynchronous_receiver/transmitter) +* [ISA](https://en.wikipedia.org/wiki/Industry_Standard_Architecture) * [memory management](https://0xax.gitbook.io/linux-insides/summary/mm) * [i2c](https://en.wikipedia.org/wiki/I%C2%B2C) * [APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller) diff --git a/Interrupts/linux-interrupts-2.md b/Interrupts/linux-interrupts-2.md index fa683c02..ff2b611e 100644 --- a/Interrupts/linux-interrupts-2.md +++ b/Interrupts/linux-interrupts-2.md @@ -196,7 +196,7 @@ for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) load_idt((const struct desc_ptr *)&idt_descr); ``` -AS you can see it has only one difference in the name of the array of the interrupts handlers entry points. Now it is `early_idt_handler_array`: +As you can see it has only one difference in the name of the array of the interrupts handlers entry points. Now it is `early_idt_handler_array`: ```C extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE]; @@ -209,7 +209,7 @@ where `NUM_EXCEPTION_VECTORS` and `EARLY_IDT_HANDLER_SIZE` are defined as: #define EARLY_IDT_HANDLER_SIZE 9 ``` -So, the `early_idt_handler_array` is an array of the interrupts handlers entry points and contains one entry point on every nine bytes. You can remember that previous `early_idt_handlers` was defined in the [arch/x86/kernel/head_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/head_64.S). The `early_idt_handler_array` is defined in the same source code file too: +So, the `early_idt_handler_array` is an array of the interrupts handlers entry points and contains one entry point on every nine bytes. You can remember that previous `early_idt_handlers` was defined in the [arch/x86/kernel/head_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/head_64.S). The `early_idt_handler_array` is defined in the same source code file too: ```assembly ENTRY(early_idt_handler_array) @@ -357,7 +357,7 @@ $ sudo cat /proc/lockdep redundant softirq offs: 0 ``` -Ok, now we know a little about tracing, but more info will be in the separate part about `lockdep` and `tracing`. You can see that the both `local_disable_irq` macros have the same part - `raw_local_irq_disable`. This macro defined in the [arch/x86/include/asm/irqflags.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/irqflags.h) and expands to the call of the: +Ok, now we know a little about tracing, but more info will be in the separate part about `lockdep` and `tracing`. You can see that the both `local_irq_disable` macros have the same part - `raw_local_irq_disable`. This macro defined in the [arch/x86/include/asm/irqflags.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/irqflags.h) and expands to the call of the: ```C static inline void native_irq_disable(void) @@ -417,7 +417,7 @@ Here we can see calls of three different functions: * `set_system_intr_gate_ist` * `set_intr_gate` -All of these functions defined in the [arch/x86/include/asm/desc.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/desc.h) and do the similar thing but not the same. The first `set_intr_gate_ist` function inserts new an interrupt gate in the `IDT`. Let's look on its implementation: +All of these functions defined in the [arch/x86/include/asm/desc.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/desc.h) and do the similar thing but not the same. The first `set_intr_gate_ist` function inserts a new interrupt gate in the `IDT`. Let's look on its implementation: ```C static inline void set_intr_gate_ist(int n, void *addr, unsigned ist) @@ -441,7 +441,7 @@ static inline void _set_gate(int gate, unsigned type, void *addr, } ``` -Here we start from the `pack_gate` function which takes clean `IDT` entry represented by the `gate_desc` structure and fills it with the base address and limit, [Interrupt Stack Table](https://www.kernel.org/doc/Documentation/x86/x86_64/kernel-stacks), [Privilege level](http://en.wikipedia.org/wiki/Privilege_level), type of an interrupt which can be one of the following values: +Here we start from the `pack_gate` function which takes clean `IDT` entry represented by the `gate_desc` structure and fills it with the base address and limit, [Interrupt Stack Table](https://www.kernel.org/doc/Documentation/x86/kernel-stacks), [Privilege level](http://en.wikipedia.org/wiki/Privilege_level), type of an interrupt which can be one of the following values: * `GATE_INTERRUPT` * `GATE_TRAP` @@ -494,7 +494,7 @@ static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) } ``` -Do you see it? Look on the fourth parameter of the `_set_gate`. It is `0x3`. In the `set_intr_gate` it was `0x0`. We know that this parameter represent `DPL` or privilege level. We also know that `0` is the highest privilege level and `3` is the lowest.Now we know how `set_system_intr_gate_ist`, `set_intr_gate_ist`, `set_intr_gate` are work and we can return to the `early_trap_init` function. Let's look on it again: +Do you see it? Look on the fourth parameter of the `_set_gate`. It is `0x3`. In the `set_intr_gate` it was `0x0`. We know that this parameter represent `DPL` or privilege level. We also know that `0` is the highest privilege level and `3` is the lowest. Now we know how `set_system_intr_gate_ist`, `set_intr_gate_ist`, `set_intr_gate` work and we can return to the `early_trap_init` function. Let's look on it again: ```C set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); @@ -537,6 +537,6 @@ Links * [Union type](http://en.wikipedia.org/wiki/Union_type) * [this_cpu_* operations](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/this_cpu_ops.txt) * [vector number](http://en.wikipedia.org/wiki/Interrupt_vector_table) -* [Interrupt Stack Table](https://www.kernel.org/doc/Documentation/x86/x86_64/kernel-stacks) +* [Interrupt Stack Table](https://www.kernel.org/doc/Documentation/x86/kernel-stacks) * [Privilege level](http://en.wikipedia.org/wiki/Privilege_level) * [Previous part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-1) diff --git a/Interrupts/linux-interrupts-3.md b/Interrupts/linux-interrupts-3.md index 11cb6e19..d358f2a4 100644 --- a/Interrupts/linux-interrupts-3.md +++ b/Interrupts/linux-interrupts-3.md @@ -4,9 +4,9 @@ Interrupts and Interrupt Handling. Part 3. Exception Handling -------------------------------------------------------------------------------- -This is the third part of the [chapter](https://0xax.gitbook.io/linux-insides/summary/interrupts) about an interrupts and an exceptions handling in the Linux kernel and in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts) we stopped at the `setup_arch` function from the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blame/master/arch/x86/kernel/setup.c) source code file. +This is the third part of the [chapter](https://0xax.gitbook.io/linux-insides/summary/interrupts) about interrupts and an exceptions handling in the Linux kernel and in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts) we stopped at the `setup_arch` function from the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blame/master/arch/x86/kernel/setup.c) source code file. -We already know that this function executes initialization of architecture-specific stuff. In our case the `setup_arch` function does [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture related initializations. The `setup_arch` is big function, and in the previous part we stopped on the setting of the two exceptions handlers for the two following exceptions: +We already know that this function executes initialization of architecture-specific stuff. In our case the `setup_arch` function does [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture related initializations. The `setup_arch` is big function, and in the previous part we stopped on the setting of the two exception handlers for the two following exceptions: * `#DB` - debug exception, transfers control from the interrupted process to the debug handler; * `#BP` - breakpoint exception, caused by the `int 3` instruction. @@ -24,18 +24,18 @@ void __init early_trap_init(void) } ``` -from the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/tree/master/arch/x86/kernel/traps.c). We already saw implementation of the `set_intr_gate_ist` and `set_system_intr_gate_ist` functions in the previous part and now we will look on the implementation of these two exceptions handlers. +from the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/tree/master/arch/x86/kernel/traps.c). We already saw implementation of the `set_intr_gate_ist` and `set_system_intr_gate_ist` functions in the previous part and now we will look on the implementation of these two exception handlers. Debug and Breakpoint exceptions -------------------------------------------------------------------------------- -Ok, we setup exception handlers in the `early_trap_init` function for the `#DB` and `#BP` exceptions and now time is to consider their implementations. But before we will do this, first of all let's look on details of these exceptions. +Ok, we setup exception handlers in the `early_trap_init` function for the `#DB` and `#BP` exceptions and now is time to consider their implementations. But before we will do this, first of all let's look on details of these exceptions. The first exceptions - `#DB` or `debug` exception occurs when a debug event occurs. For example - attempt to change the contents of a [debug register](http://en.wikipedia.org/wiki/X86_debug_register). Debug registers are special registers that were presented in `x86` processors starting from the [Intel 80386](http://en.wikipedia.org/wiki/Intel_80386) processor and as you can understand from name of this CPU extension, main purpose of these registers is debugging. These registers allow to set breakpoints on the code and read or write data to trace it. Debug registers may be accessed only in the privileged mode and an attempt to read or write the debug registers when executing at any other privilege level causes a [general protection fault](https://en.wikipedia.org/wiki/General_protection_fault) exception. That's why we have used `set_intr_gate_ist` for the `#DB` exception, but not the `set_system_intr_gate_ist`. -The verctor number of the `#DB` exceptions is `1` (we pass it as `X86_TRAP_DB`) and as we may read in specification, this exception has no error code: +The vector number of the `#DB` exceptions is `1` (we pass it as `X86_TRAP_DB`) and as we may read in specification, this exception has no error code: ``` +-----------------------------------------------------+ @@ -65,6 +65,7 @@ If we will compile and run this program, we will see following output: ``` $ gcc breakpoint.c -o breakpoint +$ ./breakpoint i equal to: 0 Trace/breakpoint trap ``` @@ -77,7 +78,7 @@ $ gdb breakpoint ... ... (gdb) run -Starting program: /home/alex/breakpoints +Starting program: /home/alex/breakpoints i equal to: 0 Program received signal SIGTRAP, Trace/breakpoint trap. @@ -112,7 +113,7 @@ As you may note before, the `set_intr_gate_ist` and `set_system_intr_gate_ist` f * `debug`; * `int3`. -You will not find these functions in the C code. all of that could be found in the kernel's `*.c/*.h` files only definition of these functions which are located in the [arch/x86/include/asm/traps.h](https://github.com/torvalds/linux/tree/master/arch/x86/include/asm/traps.h) kernel header file: +You will not find these functions in the C code. All of that could be found in the kernel's `*.c/*.h` files only definition of these functions which are located in the [arch/x86/include/asm/traps.h](https://github.com/torvalds/linux/tree/master/arch/x86/include/asm/traps.h) kernel header file: ```C asmlinkage void debug(void); @@ -138,7 +139,7 @@ and idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK ``` -Each exception handler may be consists from two parts. The first part is generic part and it is the same for all exception handlers. An exception handler should to save [general purpose registers](https://en.wikipedia.org/wiki/Processor_register) on the stack, switch to kernel stack if an exception came from userspace and transfer control to the second part of an exception handler. The second part of an exception handler does certain work depends on certain exception. For example page fault exception handler should find virtual page for given address, invalid opcode exception handler should send `SIGILL` [signal](https://en.wikipedia.org/wiki/Unix_signal) and etc. +Each exception handler may consists of two parts. The first part is generic part and it is the same for all exception handlers. An exception handler should to save [general purpose registers](https://en.wikipedia.org/wiki/Processor_register) on the stack, switch to kernel stack if an exception came from userspace and transfer control to the second part of an exception handler. The second part of an exception handler does certain work depends on certain exception. For example page fault exception handler should find virtual page for given address, invalid opcode exception handler should send `SIGILL` [signal](https://en.wikipedia.org/wiki/Unix_signal) and etc. As we just saw, an exception handler starts from definition of the `idtentry` macro from the [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/entry/entry_64.S) assembly source code file, so let's look at implementation of this macro. As we may see, the `idtentry` macro takes five arguments: @@ -193,7 +194,7 @@ If we will look at these definitions, we may know that compiler will generate tw But it is not only fake error-code. Moreover the `-1` also represents invalid system call number, so that the system call restart logic will not be triggered. -The last two parameters of the `idtentry` macro `shift_ist` and `paranoid` allow to know do an exception handler runned at stack from `Interrupt Stack Table` or not. You already may know that each kernel thread in the system has own stack. In addition to these stacks, there are some specialized stacks associated with each processor in the system. One of these stacks is - exception stack. The [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture provides special feature which is called - `Interrupt Stack Table`. This feature allows to switch to a new stack for designated events such as an atomic exceptions like `double fault` and etc. So the `shift_ist` parameter allows us to know do we need to switch on `IST` stack for an exception handler or not. +The last two parameters of the `idtentry` macro `shift_ist` and `paranoid` allow to know do an exception handler runned at stack from `Interrupt Stack Table` or not. You already may know that each kernel thread in the system has its own stack. In addition to these stacks, there are some specialized stacks associated with each processor in the system. One of these stacks is - exception stack. The [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture provides special feature which is called - `Interrupt Stack Table`. This feature allows to switch to a new stack for designated events such as an atomic exceptions like `double fault`, etc. So the `shift_ist` parameter allows us to know do we need to switch on `IST` stack for an exception handler or not. The second parameter - `paranoid` defines the method which helps us to know did we come from userspace or not to an exception handler. The easiest way to determine this is to via `CPL` or `Current Privilege Level` in `CS` segment register. If it is equal to `3`, we came from userspace, if zero we came from kernel space: @@ -213,7 +214,7 @@ But unfortunately this method does not give a 100% guarantee. As described in th > stack but before we executed SWAPGS, then the only safe way to check > for GS is the slower method: the RDMSR. -In other words for example `NMI` could happen inside the critical section of a [swapgs](http://www.felixcloutier.com/x86/SWAPGS.html) instruction. In this way we should check value of the `MSR_GS_BASE` [model specific register](https://en.wikipedia.org/wiki/Model-specific_register) which stores pointer to the start of per-cpu area. So to check did we come from userspace or not, we should to check value of the `MSR_GS_BASE` model specific register and if it is negative we came from kernel space, in other way we came from userspace: +In other words for example `NMI` could happen inside the critical section of a [swapgs](http://www.felixcloutier.com/x86/SWAPGS.html) instruction. In this way we should check value of the `MSR_GS_BASE` [model specific register](https://en.wikipedia.org/wiki/Model-specific_register) which stores pointer to the start of per-cpu area. So to check if we did come from userspace or not, we should to check value of the `MSR_GS_BASE` model specific register and if it is negative we came from kernel space, in other way we came from userspace: ```assembly movl $MSR_GS_BASE,%ecx @@ -224,7 +225,7 @@ js 1f In first two lines of code we read value of the `MSR_GS_BASE` model specific register into `edx:eax` pair. We can't set negative value to the `gs` from userspace. But from other side we know that direct mapping of the physical memory starts from the `0xffff880000000000` virtual address. In this way, `MSR_GS_BASE` will contain an address from `0xffff880000000000` to `0xffffc7ffffffffff`. After the `rdmsr` instruction will be executed, the smallest possible value in the `%edx` register will be - `0xffff8800` which is `-30720` in unsigned 4 bytes. That's why kernel space `gs` which points to start of `per-cpu` area will contain negative value. -After we pushed fake error code on the stack, we should allocate space for general purpose registers with: +After we push fake error code on the stack, we should allocate space for general purpose registers with: ```assembly ALLOC_PT_GPREGS_ON_STACK @@ -370,7 +371,7 @@ asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) } ``` -This function takes the result of the `task_ptr_regs` macro which is defined in the [arch/x86/include/asm/processor.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/processor.h) header file, stores it in the stack pointer and return it. The `task_ptr_regs` macro expands to the address of `thread.sp0` which represents pointer to the normal kernel stack: +This function takes the result of the `task_ptr_regs` macro which is defined in the [arch/x86/include/asm/processor.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/processor.h) header file, stores it in the stack pointer and returns it. The `task_ptr_regs` macro expands to the address of `thread.sp0` which represents pointer to the normal kernel stack: ```C #define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) @@ -403,7 +404,7 @@ as it will be passed as first parameter of secondary exception handler. .endif ``` -Additionally you may see that we zeroed the `%esi` register above in a case if an exception does not provide error code. +Additionally you may see that we zeroed the `%esi` register above in a case if an exception does not provide error code. In the end we just call secondary exception handler: @@ -423,7 +424,7 @@ will be for `debug` exception and: dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code); ``` -will be for `int 3` exception. In this part we will not see implementations of secondary handlers, because of they are very specific, but will see some of them in one of next parts. +will be for `int 3` exception. In this part we will not see implementations of secondary handlers, because they are very specific, but will see some of them in one of next parts. We just considered first case when an exception occurred in userspace. Let's consider last two. @@ -461,7 +462,7 @@ movq %rsp, %rdi .endif ``` -The last step before a secondary handler of an exception will be called is cleanup of new `IST` stack fram: +The last step before a secondary handler of an exception will be called is cleanup of new `IST` stack frame: ```assembly .if \shift_ist != -1 diff --git a/Interrupts/linux-interrupts-4.md b/Interrupts/linux-interrupts-4.md index 22a37a82..0aacd77c 100644 --- a/Interrupts/linux-interrupts-4.md +++ b/Interrupts/linux-interrupts-4.md @@ -99,7 +99,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) } ``` -This register contains a linear address which caused `page fault`. In the next step we make a call of the `exception_enter` function from the [include/linux/context_tracking.h](https://github.com/torvalds/linux/blob/master/include/linux/context_tracking.h). The `exception_enter` and `exception_exit` are functions from context tracking subsystem in the Linux kernel used by the [RCU](https://en.wikipedia.org/wiki/Read-copy-update) to remove its dependency on the timer tick while a processor runs in userspace. Almost in the every exception handler we will see similar code: +This register contains a linear address which caused `page fault`. In the next step we make a call of the `exception_enter` function from the [include/linux/context_tracking.h](https://github.com/torvalds/linux/blob/master/include/linux/context_tracking.h). The `exception_enter` and `exception_exit` are functions from context tracking subsystem in the Linux kernel used by the [RCU](https://en.wikipedia.org/wiki/Read-copy-update) to remove its dependency on the timer tick while a processor runs in userspace. Almost in every exception handler we will see similar code: ```C enum ctx_state prev_state; @@ -182,7 +182,7 @@ or `0x00007ffffffff000`. Pay attention on `unlikely` macro. There are two macros #define unlikely(x) __builtin_expect(!!(x), 0) ``` -You can [often](http://lxr.free-electrons.com/ident?i=unlikely) find these macros in the code of the Linux kernel. Main purpose of these macros is optimization. Sometimes this situation is that we need to check the condition of the code and we know that it will rarely be `true` or `false`. With these macros we can tell to the compiler about this. For example +You can [often](http://lxr.free-electrons.com/ident?i=unlikely) find these macros in the code of the Linux kernel. Main purpose of these macros is optimization. Sometimes this situation is that we need to check the condition of the code and we know that it will rarely be `true` or `false`. With these macros we can tell to the compiler about this. For example ```C static int proc_root_readdir(struct file *file, struct dir_context *ctx) @@ -447,7 +447,7 @@ Links * [prefetchw](http://www.felixcloutier.com/x86/PREFETCHW.html) * [3DNow](https://en.wikipedia.org/?title=3DNow!) * [CPU caches](https://en.wikipedia.org/wiki/CPU_cache) -* [VFS](https://en.wikipedia.org/wiki/Virtual_file_system) +* [VFS](https://en.wikipedia.org/wiki/Virtual_file_system) * [Linux kernel memory management](https://0xax.gitbook.io/linux-insides/summary/mm) * [Fix-Mapped Addresses and ioremap](https://0xax.gitbook.io/linux-insides/summary/mm/linux-mm-2) * [Extended Industry Standard Architecture](https://en.wikipedia.org/wiki/Extended_Industry_Standard_Architecture) diff --git a/Interrupts/linux-interrupts-5.md b/Interrupts/linux-interrupts-5.md index 1a0826ca..55f54fab 100644 --- a/Interrupts/linux-interrupts-5.md +++ b/Interrupts/linux-interrupts-5.md @@ -7,21 +7,21 @@ Implementation of exception handlers This is the fifth part about an interrupts and exceptions handling in the Linux kernel and in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-4) we stopped on the setting of interrupt gates to the [Interrupt descriptor Table](https://en.wikipedia.org/wiki/Interrupt_descriptor_table). We did it in the `trap_init` function from the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/tree/master/arch/x86/kernel/traps.c) source code file. We saw only setting of these interrupt gates in the previous part and in the current part we will see implementation of the exception handlers for these gates. The preparation before an exception handler will be executed is in the [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/entry_64.S) assembly file and occurs in the [idtentry](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/entry_64.S#L820) macro that defines exceptions entry points: ```assembly -idtentry divide_error do_divide_error has_error_code=0 -idtentry overflow do_overflow has_error_code=0 -idtentry invalid_op do_invalid_op has_error_code=0 -idtentry bounds do_bounds has_error_code=0 -idtentry device_not_available do_device_not_available has_error_code=0 -idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 -idtentry invalid_TSS do_invalid_TSS has_error_code=1 -idtentry segment_not_present do_segment_not_present has_error_code=1 -idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 -idtentry coprocessor_error do_coprocessor_error has_error_code=0 -idtentry alignment_check do_alignment_check has_error_code=1 -idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 -``` - -The `idtentry` macro does following preparation before an actual exception handler (`do_divide_error` for the `divide_error`, `do_overflow` for the `overflow` and etc.) will get control. In another words the `idtentry` macro allocates place for the registers ([pt_regs](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/uapi/asm/ptrace.h#L43) structure) on the stack, pushes dummy error code for the stack consistency if an interrupt/exception has no error code, checks the segment selector in the `cs` segment register and switches depends on the previous state(userspace or kernelspace). After all of these preparations it makes a call of an actual interrupt/exception handler: +idtentry divide_error do_divide_error has_error_code=0 +idtentry overflow do_overflow has_error_code=0 +idtentry invalid_op do_invalid_op has_error_code=0 +idtentry bounds do_bounds has_error_code=0 +idtentry device_not_available do_device_not_available has_error_code=0 +idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 +idtentry invalid_TSS do_invalid_TSS has_error_code=1 +idtentry segment_not_present do_segment_not_present has_error_code=1 +idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 +idtentry coprocessor_error do_coprocessor_error has_error_code=0 +idtentry alignment_check do_alignment_check has_error_code=1 +idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 +``` + +The `idtentry` macro does following preparation before an actual exception handler (`do_divide_error` for the `divide_error`, `do_overflow` for the `overflow`, etc.) will get control. In another words the `idtentry` macro allocates place for the registers ([pt_regs](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/uapi/asm/ptrace.h#L43) structure) on the stack, pushes dummy error code for the stack consistency if an interrupt/exception has no error code, checks the segment selector in the `cs` segment register and switches depends on the previous state (userspace or kernelspace). After all of these preparations it makes a call to an actual interrupt/exception handler: ```assembly .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 @@ -84,7 +84,7 @@ DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check) -``` +``` As we can see the `DO_ERROR` macro takes 4 parameters: @@ -112,7 +112,7 @@ dotraplinkage void do_divide_error(struct pt_regs *regs, long error_code) \ } ``` -We can see that all functions which are generated by the `DO_ERROR` macro just make a call of the `do_error_trap` function from the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/tree/master/arch/x86/kernel/traps.c). Let's look on implementation of the `do_error_trap` function. +We can see that all functions which are generated by the `DO_ERROR` macro just make a call to the `do_error_trap` function from the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/tree/master/arch/x86/kernel/traps.c). Let's look on implementation of the `do_error_trap` function. Trap handlers -------------------------------------------------------------------------------- @@ -150,7 +150,7 @@ enum ctx_state { } state; ``` -The second function is `exception_exit` defined in the same [include/linux/context_tracking.h](https://github.com/torvalds/linux/tree/master/include/linux/context_tracking.h) file and checks that context tracking is enabled and call the `contert_tracking_enter` function if the previous context was `user`: +The second function is `exception_exit` defined in the same [include/linux/context_tracking.h](https://github.com/torvalds/linux/tree/master/include/linux/context_tracking.h) file and checks that context tracking is enabled and call the `context_tracking_enter` function if the previous context was `user`: ```C static inline void exception_exit(enum ctx_state prev_ctx) @@ -173,7 +173,7 @@ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != } ``` -First of all it calls the `notify_die` function which defined in the [kernel/notifier.c](https://github.com/torvalds/linux/tree/master/kernel/notifier.c). To get notified for [kernel panic](https://en.wikipedia.org/wiki/Kernel_panic), [kernel oops](https://en.wikipedia.org/wiki/Linux_kernel_oops), [Non-Maskable Interrupt](https://en.wikipedia.org/wiki/Non-maskable_interrupt) or other events the caller needs to insert itself in the `notify_die` chain and the `notify_die` function does it. The Linux kernel has special mechanism that allows kernel to ask when something happens and this mechanism called `notifiers` or `notifier chains`. This mechanism used for example for the `USB` hotplug events (look on the [drivers/usb/core/notify.c](https://github.com/torvalds/linux/tree/master/drivers/usb/core/notify.c)), for the memory [hotplug](https://en.wikipedia.org/wiki/Hot_swapping) (look on the [include/linux/memory.h](https://github.com/torvalds/linux/tree/master/include/linux/memory.h), the `hotplug_memory_notifier` macro and etc...), system reboots and etc. A notifier chain is thus a simple, singly-linked list. When a Linux kernel subsystem wants to be notified of specific events, it fills out a special `notifier_block` structure and passes it to the `notifier_chain_register` function. An event can be sent with the call of the `notifier_call_chain` function. First of all the `notify_die` function fills `die_args` structure with the trap number, trap string, registers and other values: +First of all it calls the `notify_die` function which defined in the [kernel/notifier.c](https://github.com/torvalds/linux/tree/master/kernel/notifier.c). To get notified for [kernel panic](https://en.wikipedia.org/wiki/Kernel_panic), [kernel oops](https://en.wikipedia.org/wiki/Linux_kernel_oops), [Non-Maskable Interrupt](https://en.wikipedia.org/wiki/Non-maskable_interrupt) or other events the caller needs to insert itself in the `notify_die` chain and the `notify_die` function does it. The Linux kernel has special mechanism that allows kernel to ask when something happens and this mechanism called `notifiers` or `notifier chains`. This mechanism used for example for the `USB` hotplug events (look on the [drivers/usb/core/notify.c](https://github.com/torvalds/linux/tree/master/drivers/usb/core/notify.c)), for the memory [hotplug](https://en.wikipedia.org/wiki/Hot_swapping) (look on the [include/linux/memory.h](https://github.com/torvalds/linux/tree/master/include/linux/memory.h), the `hotplug_memory_notifier` macro, etc...), system reboots, etc. A notifier chain is thus a simple, singly-linked list. When a Linux kernel subsystem wants to be notified of specific events, it fills out a special `notifier_block` structure and passes it to the `notifier_chain_register` function. An event can be sent with the call of the `notifier_call_chain` function. First of all the `notify_die` function fills `die_args` structure with the trap number, trap string, registers and other values: ```C struct die_args args = { @@ -247,7 +247,7 @@ if (!fixup_exception(regs)) { } ``` -The `die` function defined in the [arch/x86/kernel/dumpstack.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/dumpstack.c) source code file, prints useful information about stack, registers, kernel modules and caused kernel [oops](https://en.wikipedia.org/wiki/Linux_kernel_oops). If we came from the userspace the `do_trap_no_signal` function will return `-1` and the execution of the `do_trap` function will continue. If we passed through the `do_trap_no_signal` function and did not exit from the `do_trap` after this, it means that previous context was - `user`. Most exceptions caused by the processor are interpreted by Linux as error conditions, for example division by zero, invalid opcode and etc. When an exception occurs the Linux kernel sends a [signal](https://en.wikipedia.org/wiki/Unix_signal) to the interrupted process that caused the exception to notify it of an incorrect condition. So, in the `do_trap` function we need to send a signal with the given number (`SIGFPE` for the divide error, `SIGILL` for a illegal instruction and etc...). First of all we save error code and vector number in the current interrupts process with the filling `thread.error_code` and `thread_trap_nr`: +The `die` function defined in the [arch/x86/kernel/dumpstack.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/dumpstack.c) source code file, prints useful information about stack, registers, kernel modules and caused kernel [oops](https://en.wikipedia.org/wiki/Linux_kernel_oops). If we came from the userspace the `do_trap_no_signal` function will return `-1` and the execution of the `do_trap` function will continue. If we passed through the `do_trap_no_signal` function and did not exit from the `do_trap` after this, it means that previous context was - `user`. Most exceptions caused by the processor are interpreted by Linux as error conditions, for example division by zero, invalid opcode, etc. When an exception occurs the Linux kernel sends a [signal](https://en.wikipedia.org/wiki/Unix_signal) to the interrupted process that caused the exception to notify it of an incorrect condition. So, in the `do_trap` function we need to send a signal with the given number (`SIGFPE` for the divide error, `SIGILL` for a illegal instruction, etc.). First of all we save error code and vector number in the current interrupts process with the filling `thread.error_code` and `thread_trap_nr`: ```C tsk->thread.error_code = error_code; @@ -275,7 +275,7 @@ And send a given signal to interrupted process: force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk); ``` -This is the end of the `do_trap`. We just saw generic implementation for eight different exceptions which are defined with the `DO_ERROR` macro. Now let's look on another exception handlers. +This is the end of the `do_trap`. We just saw generic implementation for eight different exceptions which are defined with the `DO_ERROR` macro. Now let's look at other exception handlers. Double fault -------------------------------------------------------------------------------- @@ -286,7 +286,7 @@ The next exception is `#DF` or `Double fault`. This exception occurs when the pr set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK); ``` -Note that this exception runs on the `DOUBLEFAULT_STACK` [Interrupt Stack Table](https://www.kernel.org/doc/Documentation/x86/x86_64/kernel-stacks) which has index - `1`: +Note that this exception runs on the `DOUBLEFAULT_STACK` [Interrupt Stack Table](https://www.kernel.org/doc/Documentation/x86/kernel-stacks) which has index - `1`: ```C #define DOUBLEFAULT_STACK 1 @@ -485,7 +485,7 @@ Links * [printk](https://en.wikipedia.org/wiki/Printk) * [coprocessor](https://en.wikipedia.org/wiki/Coprocessor) * [SIMD](https://en.wikipedia.org/wiki/SIMD) -* [Interrupt Stack Table](https://www.kernel.org/doc/Documentation/x86/x86_64/kernel-stacks) +* [Interrupt Stack Table](https://www.kernel.org/doc/Documentation/x86/kernel-stacks) * [PID](https://en.wikipedia.org/wiki/Process_identifier) * [x87 FPU](https://en.wikipedia.org/wiki/X87) * [control register](https://en.wikipedia.org/wiki/Control_register) diff --git a/Interrupts/linux-interrupts-6.md b/Interrupts/linux-interrupts-6.md index 33838072..4a3ecac1 100644 --- a/Interrupts/linux-interrupts-6.md +++ b/Interrupts/linux-interrupts-6.md @@ -4,7 +4,7 @@ Interrupts and Interrupt Handling. Part 6. Non-maskable interrupt handler -------------------------------------------------------------------------------- -It is sixth part of the [Interrupts and Interrupt Handling in the Linux kernel](https://0xax.gitbook.io/linux-insides/summary/interrupts) chapter and in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-5) we saw implementation of some exception handlers for the [General Protection Fault](https://en.wikipedia.org/wiki/General_protection_fault) exception, divide exception, invalid [opcode](https://en.wikipedia.org/wiki/Opcode) exceptions and etc. As I wrote in the previous part we will see implementations of the rest exceptions in this part. We will see implementation of the following handlers: +It is sixth part of the [Interrupts and Interrupt Handling in the Linux kernel](https://0xax.gitbook.io/linux-insides/summary/interrupts) chapter and in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-5) we saw implementation of some exception handlers for the [General Protection Fault](https://en.wikipedia.org/wiki/General_protection_fault) exception, divide exception, invalid [opcode](https://en.wikipedia.org/wiki/Opcode) exceptions, etc. As I wrote in the previous part we will see implementations of the rest exceptions in this part. We will see implementation of the following handlers: * [Non-Maskable](https://en.wikipedia.org/wiki/Non-maskable_interrupt) interrupt; * [BOUND](http://pdos.csail.mit.edu/6.828/2005/readings/i386/BOUND.htm) Range Exceeded Exception; @@ -169,7 +169,7 @@ pushq $-1 ALLOC_PT_GPREGS_ON_STACK ``` -We already saw implementation of the `ALLOC_PT_GREGS_ON_STACK` macro in the third part of the interrupts [chapter](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-3). This macro defined in the [arch/x86/entry/calling.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/calling.h) and yet another allocates `120` bytes on stack for the general purpose registers, from the `rdi` to the `r15`: +We already saw implementation of the `ALLOC_PT_GPREGS_ON_STACK` macro in the third part of the interrupts [chapter](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-3). This macro defined in the [arch/x86/entry/calling.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/calling.h) and yet another allocates `120` bytes on stack for the general purpose registers, from the `rdi` to the `r15`: ```assembly .macro ALLOC_PT_GPREGS_ON_STACK addskip=0 @@ -261,7 +261,7 @@ Now let's look on the `do_nmi` exception handler. This function defined in the [ * error code. as all exception handlers. The `do_nmi` starts from the call of the `nmi_nesting_preprocess` function and ends with the call of the `nmi_nesting_postprocess`. The `nmi_nesting_preprocess` function checks that we likely do not work with the debug stack and if we on the debug stack set the `update_debug_stack` [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) variable to `1` and call the `debug_stack_set_zero` function from the [arch/x86/kernel/cpu/common.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/cpu/common.c). This function increases the `debug_stack_use_ctr` per-cpu variable and loads new `Interrupt Descriptor Table`: - + ```C static inline void nmi_nesting_preprocess(struct pt_regs *regs) { @@ -362,7 +362,7 @@ After all of this, there is still only one way when `MPX` is responsible for thi Coprocessor exception and SIMD exception -------------------------------------------------------------------------------- -The next two exceptions are [x87 FPU](https://en.wikipedia.org/wiki/X87) Floating-Point Error exception or `#MF` and [SIMD](https://en.wikipedia.org/wiki/SIMD) Floating-Point Exception or `#XF`. The first exception occurs when the `x87 FPU` has detected floating point error. For example divide by zero, numeric overflow and etc. The second exception occurs when the processor has detected [SSE/SSE2/SSE3](https://en.wikipedia.org/wiki/SSE3) `SIMD` floating-point exception. It can be the same as for the `x87 FPU`. The handlers for these exceptions are `do_coprocessor_error` and `do_simd_coprocessor_error` are defined in the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/traps.c) and very similar on each other. They both make a call of the `math_error` function from the same source code file but pass different vector number. The `do_coprocessor_error` passes `X86_TRAP_MF` vector number to the `math_error`: +The next two exceptions are [x87 FPU](https://en.wikipedia.org/wiki/X87) Floating-Point Error exception or `#MF` and [SIMD](https://en.wikipedia.org/wiki/SIMD) Floating-Point Exception or `#XF`. The first exception occurs when the `x87 FPU` has detected floating point error. For example divide by zero, numeric overflow, etc. The second exception occurs when the processor has detected [SSE/SSE2/SSE3](https://en.wikipedia.org/wiki/SSE3) `SIMD` floating-point exception. It can be the same as for the `x87 FPU`. The handlers for these exceptions are `do_coprocessor_error` and `do_simd_coprocessor_error` are defined in the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/traps.c) and very similar on each other. They both make a call of the `math_error` function from the same source code file but pass different vector number. The `do_coprocessor_error` passes `X86_TRAP_MF` vector number to the `math_error`: ```C dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) @@ -389,7 +389,7 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code) } ``` -First of all the `math_error` function defines current interrupted task, address of its fpu, string which describes an exception, add it to the `notify_die` chain and return from the exception handler if it will return `NOTIFY_STOP`: +First of all the `math_error` function defines current interrupted task, address of its FPU, string which describes an exception, add it to the `notify_die` chain and return from the exception handler if it will return `NOTIFY_STOP`: ```C struct task_struct *task = current; @@ -434,7 +434,7 @@ After this we check the signal code and if it is non-zero we return: if (!info.si_code) return; ``` - + Or send the `SIGFPE` signal in the end: ```C @@ -446,7 +446,7 @@ That's all. Conclusion -------------------------------------------------------------------------------- -It is the end of the sixth part of the [Interrupts and Interrupt Handling](https://0xax.gitbook.io/linux-insides/summary/interrupts) chapter and we saw implementation of some exception handlers in this part, like `non-maskable` interrupt, [SIMD](https://en.wikipedia.org/wiki/SIMD) and [x87 FPU](https://en.wikipedia.org/wiki/X87) floating point exception. Finally we have finished with the `trap_init` function in this part and will go ahead in the next part. The next our point is the external interrupts and the `early_irq_init` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c). +It is the end of the sixth part of the [Interrupts and Interrupt Handling](https://0xax.gitbook.io/linux-insides/summary/interrupts) chapter and we saw implementation of some exception handlers in this part, like `non-maskable` interrupt, [SIMD](https://en.wikipedia.org/wiki/SIMD) and [x87 FPU](https://en.wikipedia.org/wiki/X87) floating point exception. Finally, we finished with the `trap_init` function in this part and will go ahead in the next part. The next our point is the external interrupts and the `early_irq_init` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c). If you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX). @@ -457,7 +457,7 @@ Links * [General Protection Fault](https://en.wikipedia.org/wiki/General_protection_fault) * [opcode](https://en.wikipedia.org/wiki/Opcode) -* [Non-Maskable](https://en.wikipedia.org/wiki/Non-maskable_interrupt) +* [Non-Maskable](https://en.wikipedia.org/wiki/Non-maskable_interrupt) * [BOUND instruction](http://pdos.csail.mit.edu/6.828/2005/readings/i386/BOUND.htm) * [CPU socket](https://en.wikipedia.org/wiki/CPU_socket) * [Interrupt Descriptor Table](https://en.wikipedia.org/wiki/Interrupt_descriptor_table) @@ -474,7 +474,7 @@ Links * [stack frame](https://en.wikipedia.org/wiki/Call_stack) * [Model Specific register](https://en.wikipedia.org/wiki/Model-specific_register) * [percpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) -* [RCU](https://en.wikipedia.org/wiki/Read-copy-update) +* [RCU](https://en.wikipedia.org/wiki/Read-copy-update) * [MPX](https://en.wikipedia.org/wiki/Intel_MPX) * [x87 FPU](https://en.wikipedia.org/wiki/X87) * [Previous part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-5) diff --git a/Interrupts/linux-interrupts-7.md b/Interrupts/linux-interrupts-7.md index 4f96fd34..d96401b5 100644 --- a/Interrupts/linux-interrupts-7.md +++ b/Interrupts/linux-interrupts-7.md @@ -4,9 +4,9 @@ Interrupts and Interrupt Handling. Part 7. Introduction to external interrupts -------------------------------------------------------------------------------- -This is the seventh part of the Interrupts and Interrupt Handling in the Linux kernel [chapter](https://0xax.gitbook.io/linux-insides/summary/interrupts) and in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-6) we have finished with the exceptions which are generated by the processor. In this part we will continue to dive to the interrupt handling and will start with the external hardware interrupt handling. As you can remember, in the previous part we have finished with the `trap_init` function from the [arch/x86/kernel/trap.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/traps.c) and the next step is the call of the `early_irq_init` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c). +This is the seventh part of the Interrupts and Interrupt Handling in the Linux kernel [chapter](https://0xax.gitbook.io/linux-insides/summary/interrupts) and in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-6) we have finished with the exceptions which are generated by the processor. In this part we will continue to dive to the interrupt handling and will start with the external hardware interrupt handling. As you can remember, in the previous part we have finished with the `trap_init` function from the [arch/x86/kernel/trap.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/traps.c) and the next step is the call of the `early_irq_init` function from [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c). -Interrupts are signal that are sent across [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) or `Interrupt Request Line` by a hardware or software. External hardware interrupts allow devices like keyboard, mouse and etc, to indicate that it needs attention of the processor. Once the processor receives the `Interrupt Request`, it will temporary stop execution of the running program and invoke special routine which depends on an interrupt. We already know that this routine is called interrupt handler (or how we will call it `ISR` or `Interrupt Service Routine` from this part). The `ISR` or `Interrupt Handler Routine` can be found in Interrupt Vector table that is located at fixed address in the memory. After the interrupt is handled processor resumes the interrupted process. At the boot/initialization time, the Linux kernel identifies all devices in the machine, and appropriate interrupt handlers are loaded into the interrupt table. As we saw in the previous parts, most exceptions are handled simply by the sending a [Unix signal](https://en.wikipedia.org/wiki/Unix_signal) to the interrupted process. That's why kernel is can handle an exception quickly. Unfortunately we can not use this approach for the external hardware interrupts, because often they arrive after (and sometimes long after) the process to which they are related has been suspended. So it would make no sense to send a Unix signal to the current process. External interrupt handling depends on the type of an interrupt: +Interrupts are signal that are sent across [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) or `Interrupt Request Line` by a hardware or software. External hardware interrupts allow devices like keyboard, mouse and etc, to indicate that it needs attention of the processor. Once the processor receives the `Interrupt Request`, it will temporary stop execution of the running program and invoke special routine which depends on an interrupt. We already know that this routine is called interrupt handler (or how we will call it `ISR` or `Interrupt Service Routine` from this part). The `ISR` or `Interrupt Handler Routine` can be found in Interrupt Vector table that is located at fixed address in the memory. After the interrupt is handled processor resumes the interrupted process. At the boot/initialization time, the Linux kernel identifies all devices in the machine, and appropriate interrupt handlers are loaded into the interrupt table. As we saw in the previous parts, most exceptions are handled simply by the sending a [Unix signal](https://en.wikipedia.org/wiki/Unix_signal) to the interrupted process. That's how the kernel can handle an exception quickly. Unfortunately we can not use this approach for the external hardware interrupts, because often they arrive after (and sometimes long after) the process to which they are related has been suspended. So it would make no sense to send a Unix signal to the current process. External interrupt handling depends on the type of an interrupt: * `I/O` interrupts; * Timer interrupts; @@ -21,7 +21,7 @@ Generally, a handler of an `I/O` interrupt must be flexible enough to service se * Execute the interrupt service routine (next we will call it `ISR`) which is associated with the device; * Restore registers and return from an interrupt; -Ok, we know a little theory and now let's start with the `early_irq_init` function. The implementation of the `early_irq_init` function is in the [kernel/irq/irqdesc.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/irq/irqdesc.c). This function make early initialization of the `irq_desc` structure. The `irq_desc` structure is the foundation of interrupt management code in the Linux kernel. An array of this structure, which has the same name - `irq_desc`, keeps track of every interrupt request source in the Linux kernel. This structure defined in the [include/linux/irqdesc.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/irqdesc.h) and as you can note it depends on the `CONFIG_SPARSE_IRQ` kernel configuration option. This kernel configuration option enables support for sparse irqs. The `irq_desc` structure contains many different files: +Ok, we know a little theory and now let's start with the `early_irq_init` function. The implementation of the `early_irq_init` function is in the [kernel/irq/irqdesc.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/irq/irqdesc.c). This function make early initialization of the `irq_desc` structure. The `irq_desc` structure is the foundation of interrupt management code in the Linux kernel. An array of this structure, which has the same name - `irq_desc`, keeps track of every interrupt request source in the Linux kernel. This structure defined in the [include/linux/irqdesc.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/irqdesc.h) and as you can note it depends on the `CONFIG_SPARSE_IRQ` kernel configuration option. This kernel configuration option enables support for sparse IRQs. The `irq_desc` structure contains many different fields: * `irq_common_data` - per irq and chip data passed down to chip functions; * `status_use_accessors` - contains status of the interrupt source which is combination of the values from the `enum` from the [include/linux/irq.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/irq.h) and different macros which are defined in the same source code file; @@ -29,7 +29,7 @@ Ok, we know a little theory and now let's start with the `early_irq_init` functi * `handle_irq` - highlevel irq-events handler; * `action` - identifies the interrupt service routines to be invoked when the [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) occurs; * `irq_count` - counter of interrupt occurrences on the IRQ line; -* `depth` - `0` if the IRQ line is enabled and a positive value if it has been disabled at least once; +* `depth` - `0` if the IRQ line is enabled and a positive value if it has been disabled at least once; * `last_unhandled` - aging timer for unhandled count; * `irqs_unhandled` - count of the unhandled interrupts; * `lock` - a spin lock used to serialize the accesses to the `IRQ` descriptor; @@ -75,10 +75,10 @@ As I already wrote, implementation of the `first_online_node` macro depends on t #if MAX_NUMNODES > 1 #define first_online_node first_node(node_states[N_ONLINE]) #else - #define first_online_node 0 + #define first_online_node 0 ``` -The `node_states` is the [enum](https://en.wikipedia.org/wiki/Enumerated_type) which defined in the [include/linux/nodemask.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/nodemask.h) and represent the set of the states of a node. In our case we are searching an online node and it will be `0` if `MAX_NUMNODES` is one or zero. If the `MAX_NUMNODES` is greater than one, the `node_states[N_ONLINE]` will return `1` and the `first_node` macro will be expands to the call of the `__first_node` function which will return `minimal` or the first online node: +The `node_states` is the [enum](https://en.wikipedia.org/wiki/Enumerated_type) which defined in the [include/linux/nodemask.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/nodemask.h) and represent the set of the states of a node. In our case we are searching an online node and it will be `0` if `MAX_NUMNODES` is one or zero. If the `MAX_NUMNODES` is greater than one, the `node_states[N_ONLINE]` will return `1` and the `first_node` macro will be expanded to the call of the `__first_node` function which will return `minimal` or the first online node: ```C #define first_node(src) __first_node(&(src)) @@ -113,7 +113,7 @@ static void __init init_irq_default_affinity(void) #endif ``` -We know that when a hardware, such as disk controller or keyboard, needs attention from the processor, it throws an interrupt. The interrupt tells to the processor that something has happened and that the processor should interrupt current process and handle an incoming event. In order to prevent multiple devices from sending the same interrupts, the [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) system was established where each device in a computer system is assigned its own special IRQ so that its interrupts are unique. Linux kernel can assign certain `IRQs` to specific processors. This is known as `SMP IRQ affinity`, and it allows you control how your system will respond to various hardware events (that's why it has certain implementation only if the `CONFIG_SMP` kernel configuration option is set). After we allocated `irq_default_affinity` cpumask, we can see `printk` output: +We know that when a hardware, such as disk controller or keyboard, needs attention from the processor, it throws an interrupt. The interrupt tells to the processor that something has happened and that the processor should interrupt current process and handle an incoming event. In order to prevent multiple devices from sending the same interrupts, the [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) system was established where each device in a computer system is assigned its own special IRQ so that its interrupts are unique. Linux kernel can assign certain `IRQs` to specific processors. This is known as `SMP IRQ affinity`, and it allows you to control how your system will respond to various hardware events (that's why it has certain implementation only if the `CONFIG_SMP` kernel configuration option is set). After we allocated `irq_default_affinity` cpumask, we can see `printk` output: ```C printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); @@ -189,7 +189,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { The `irq_desc` is array of the `irq` descriptors. It has three already initialized fields: -* `handle_irq` - as I already wrote above, this field is the highlevel irq-event handler. In our case it initialized with the `handle_bad_irq` function that defined in the [kernel/irq/handle.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/irq/handle.c) source code file and handles spurious and unhandled irqs; +* `handle_irq` - as I already wrote above, this field is the highlevel irq-event handler. In our case it initialized with the `handle_bad_irq` function that defined in the [kernel/irq/handle.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/irq/handle.c) source code file and handles spurious and unhandled IRQs; * `depth` - `0` if the IRQ line is enabled and a positive value if it has been disabled at least once; * `lock` - A spin lock used to serialize the accesses to the `IRQ` descriptor. @@ -258,7 +258,7 @@ irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); ... ``` -In the next step we set the high level interrupt handlers to the `handle_bad_irq` which handles spurious and unhandled irqs (as the hardware stuff is not initialized yet, we set this handler), set `irq_desc.desc` to `1` which means that an `IRQ` is disabled, reset count of the unhandled interrupts and interrupts in general: +In the next step we set the high level interrupt handlers to the `handle_bad_irq` which handles spurious and unhandled IRQs (as the hardware stuff is not initialized yet, we set this handler), set `irq_desc.desc` to `1` which means that an `IRQ` is disabled, reset count of the unhandled interrupts and interrupts in general: ```C ... @@ -294,14 +294,14 @@ static void desc_smp_init(struct irq_desc *desc, int node) #endif } ``` - + In the end of the `early_irq_init` function we return the return value of the `arch_early_irq_init` function: ```C return arch_early_irq_init(); ``` -This function defined in the [kernel/apic/vector.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/apic/vector.c) and contains only one call of the `arch_early_ioapic_init` function from the [kernel/apic/io_apic.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/apic/io_apic.c). As we can understand from the `arch_early_ioapic_init` function's name, this function makes early initialization of the [I/O APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller). First of all it make a check of the number of the legacy interrupts with the call of the `nr_legacy_irqs` function. If we have no legacy interrupts with the [Intel 8259](https://en.wikipedia.org/wiki/Intel_8259) programmable interrupt controller we set `io_apic_irqs` to the `0xffffffffffffffff`: +This function defined in the [kernel/apic/vector.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/apic/vector.c) and contains only one call of the `arch_early_ioapic_init` function from the [kernel/apic/io_apic.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/apic/io_apic.c). As we can understand from the `arch_early_ioapic_init` function's name, this function makes early initialization of the [I/O APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller). First of all it make a check of the number of the legacy interrupts with the call of the `nr_legacy_irqs` function. If we have no legacy interrupts with the [Intel 8259](https://en.wikipedia.org/wiki/Intel_8259) programmable interrupt controller we set `io_apic_irqs` to the `0xffffffffffffffff`: ```C if (!nr_legacy_irqs()) @@ -315,7 +315,7 @@ for_each_ioapic(i) alloc_ioapic_saved_registers(i); ``` -And in the end of the `arch_early_ioapic_init` function we are going through the all legacy irqs (from `IRQ0` to `IRQ15`) in the loop and allocate space for the `irq_cfg` which represents configuration of an irq on the given `NUMA` node: +And in the end of the `arch_early_ioapic_init` function we are going through the all legacy IRQs (from `IRQ0` to `IRQ15`) in the loop and allocate space for the `irq_cfg` which represents configuration of an irq on the given `NUMA` node: ```C for (i = 0; i < nr_legacy_irqs(); i++) { @@ -330,7 +330,7 @@ That's all. Sparse IRQs -------------------------------------------------------------------------------- -We already saw in the beginning of this part that implementation of the `early_irq_init` function depends on the `CONFIG_SPARSE_IRQ` kernel configuration option. Previously we saw implementation of the `early_irq_init` function when the `CONFIG_SPARSE_IRQ` configuration option is not set, now let's look on the its implementation when this option is set. Implementation of this function very similar, but little differ. We can see the same definition of variables and call of the `init_irq_default_affinity` in the beginning of the `early_irq_init` function: +We already saw in the beginning of this part that implementation of the `early_irq_init` function depends on the `CONFIG_SPARSE_IRQ` kernel configuration option. Previously we saw implementation of the `early_irq_init` function when the `CONFIG_SPARSE_IRQ` configuration option is not set, now let's look at its implementation when this option is set. Implementation of this function very similar, but little differ. We can see the same definition of variables and call of the `init_irq_default_affinity` in the beginning of the `early_irq_init` function: ```C #ifdef CONFIG_SPARSE_IRQ @@ -356,7 +356,7 @@ But after this we can see the following call: initcnt = arch_probe_nr_irqs(); ``` -The `arch_probe_nr_irqs` function defined in the [arch/x86/kernel/apic/vector.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/apic/vector.c) and calculates count of the pre-allocated irqs and update `nr_irqs` with its number. But stop. Why there are pre-allocated irqs? There is alternative form of interrupts called - [Message Signaled Interrupts](https://en.wikipedia.org/wiki/Message_Signaled_Interrupts) available in the [PCI](https://en.wikipedia.org/wiki/Conventional_PCI). Instead of assigning a fixed number of the interrupt request, the device is allowed to record a message at a particular address of RAM, in fact, the display on the [Local APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller#Integrated_local_APICs). `MSI` permits a device to allocate `1`, `2`, `4`, `8`, `16` or `32` interrupts and `MSI-X` permits a device to allocate up to `2048` interrupts. Now we know that irqs can be pre-allocated. More about `MSI` will be in a next part, but now let's look on the `arch_probe_nr_irqs` function. We can see the check which assign amount of the interrupt vectors for the each processor in the system to the `nr_irqs` if it is greater and calculate the `nr` which represents number of `MSI` interrupts: +The `arch_probe_nr_irqs` function defined in the [arch/x86/kernel/apic/vector.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/apic/vector.c) and calculates count of the pre-allocated IRQs and update `nr_irqs` with this number. But stop. Why are there pre-allocated IRQs? There is alternative form of interrupts called - [Message Signaled Interrupts](https://en.wikipedia.org/wiki/Message_Signaled_Interrupts) available in the [PCI](https://en.wikipedia.org/wiki/Conventional_PCI). Instead of assigning a fixed number of the interrupt request, the device is allowed to record a message at a particular address of RAM, in fact, the display on the [Local APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller#Integrated_local_APICs). `MSI` permits a device to allocate `1`, `2`, `4`, `8`, `16` or `32` interrupts and `MSI-X` permits a device to allocate up to `2048` interrupts. Now we know that IRQs can be pre-allocated. More about `MSI` will be in a next part, but now let's look on the `arch_probe_nr_irqs` function. We can see the check which assign amount of the interrupt vectors for the each processor in the system to the `nr_irqs` if it is greater and calculate the `nr` which represents number of `MSI` interrupts: ```C int nr_irqs = NR_IRQS; @@ -367,7 +367,7 @@ if (nr_irqs > (NR_VECTORS * nr_cpu_ids)) nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids; ``` -Take a look on the `gsi_top` variable. Each `APIC` is identified with its own `ID` and with the offset where its `IRQ` starts. It is called `GSI` base or `Global System Interrupt` base. So the `gsi_top` represents it. We get the `Global System Interrupt` base from the [MultiProcessor Configuration Table](https://en.wikipedia.org/wiki/MultiProcessor_Specification) table (you can remember that we have parsed this table in the sixth [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-6) of the Linux Kernel initialization process chapter). +Take a look on the `gsi_top` variable. Each `APIC` is identified with its own `ID` and with the offset where its `IRQ` starts. It is called `GSI` base or `Global System Interrupt` base. So the `gsi_top` represents it. We get the `Global System Interrupt` base from the [MultiProcessor Configuration Table](https://en.wikipedia.org/wiki/MultiProcessor_Specification) table (you can remember that we have parsed this table in the sixth [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-6) of the Linux kernel initialization process chapter). After this we update the `nr` depends on the value of the `gsi_top`: @@ -380,7 +380,7 @@ After this we update the `nr` depends on the value of the `gsi_top`: #endif ``` -Update the `nr_irqs` if it less than `nr` and return the number of the legacy irqs: +Update the `nr_irqs` if it less than `nr` and return the number of the legacy IRQs: ```C if (nr < nr_irqs) diff --git a/Interrupts/linux-interrupts-8.md b/Interrupts/linux-interrupts-8.md index 74888f83..7adc926a 100644 --- a/Interrupts/linux-interrupts-8.md +++ b/Interrupts/linux-interrupts-8.md @@ -113,10 +113,10 @@ In the end of the `init_IRQ` function we can see the call of the following funct x86_init.irqs.intr_init(); ``` -from the [arch/x86/kernel/x86_init.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/x86_init.c) source code file. If you have read [chapter](https://0xax.gitbook.io/linux-insides/summary/initialization) about the Linux kernel initialization process, you can remember the `x86_init` structure. This structure contains a couple of files which are points to the function related to the platform setup (`x86_64` in our case), for example `resources` - related with the memory resources, `mpparse` - related with the parsing of the [MultiProcessor Configuration Table](https://en.wikipedia.org/wiki/MultiProcessor_Specification) table and etc.). As we can see the `x86_init` also contains the `irqs` field which contains three following fields: +from the [arch/x86/kernel/x86_init.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/x86_init.c) source code file. If you have read [chapter](https://0xax.gitbook.io/linux-insides/summary/initialization) about the Linux kernel initialization process, you can remember the `x86_init` structure. This structure contains a couple of files which point to the function related to the platform setup (`x86_64` in our case), for example `resources` - related with the memory resources, `mpparse` - related with the parsing of the [MultiProcessor Configuration Table](https://en.wikipedia.org/wiki/MultiProcessor_Specification) table, etc.). As we can see the `x86_init` also contains the `irqs` field which contains the three following fields: ```C -struct x86_init_ops x86_init __initdata +struct x86_init_ops x86_init __initdata { ... ... @@ -132,7 +132,7 @@ struct x86_init_ops x86_init __initdata } ``` -Now, we are interesting in the `native_init_IRQ`. As we can note, the name of the `native_init_IRQ` function contains the `native_` prefix which means that this function is architecture-specific. It defined in the [arch/x86/kernel/irqinit.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/irqinit.c) and executes general initialization of the [Local APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller#Integrated_local_APICs) and initialization of the [ISA](https://en.wikipedia.org/wiki/Industry_Standard_Architecture) irqs. Let's look on the implementation of the `native_init_IRQ` function and will try to understand what occurs there. The `native_init_IRQ` function starts from the execution of the following function: +Now, we are interesting in the `native_init_IRQ`. As we can note, the name of the `native_init_IRQ` function contains the `native_` prefix which means that this function is architecture-specific. It defined in the [arch/x86/kernel/irqinit.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/irqinit.c) and executes general initialization of the [Local APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller#Integrated_local_APICs) and initialization of the [ISA](https://en.wikipedia.org/wiki/Industry_Standard_Architecture) irqs. Let's look at the implementation of the `native_init_IRQ` function and try to understand what occurs there. The `native_init_IRQ` function starts from the execution of the following function: ```C x86_init.irqs.pre_vector_init(); @@ -155,21 +155,21 @@ The `irq_chip` structure defined in the [include/linux/irq.h](https://github.com ```C $ cat /proc/interrupts - CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7 + CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7 0: 16 0 0 0 0 0 0 0 IO-APIC 2-edge timer 1: 2 0 0 0 0 0 0 0 IO-APIC 1-edge i8042 8: 1 0 0 0 0 0 0 0 IO-APIC 8-edge rtc0 ``` -look on the last column; +look at the last column; * `(*irq_mask)(struct irq_data *data)` - mask an interrupt source; * `(*irq_ack)(struct irq_data *data)` - start of a new interrupt; * `(*irq_startup)(struct irq_data *data)` - start up the interrupt; * `(*irq_shutdown)(struct irq_data *data)` - shutdown the interrupt -* and etc. +* etc. -fields. Note that the `irq_data` structure represents set of the per irq chip data passed down to chip functions. It contains `mask` - precomputed bitmask for accessing the chip registers, `irq` - interrupt number, `hwirq` - hardware interrupt number, local to the interrupt domain chip low level interrupt hardware access and etc. +fields. Note that the `irq_data` structure represents set of the per irq chip data passed down to chip functions. It contains `mask` - precomputed bitmask for accessing the chip registers, `irq` - interrupt number, `hwirq` - hardware interrupt number, local to the interrupt domain chip low level interrupt hardware access, etc. After this depends on the `CONFIG_X86_64` and `CONFIG_X86_LOCAL_APIC` kernel configuration option call the `init_bsp_APIC` function from the [arch/x86/kernel/apic/apic.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/apic/apic.c): @@ -186,7 +186,7 @@ if (smp_found_config || !cpu_has_apic) return; ``` -In other way we return from this function. In the next step we call the `clear_local_APIC` function from the same source code file that shutdowns the local `APIC` (more about it will be in the chapter about the `Advanced Programmable Interrupt Controller`) and enable `APIC` of the first processor by the setting `unsigned int value` to the `APIC_SPIV_APIC_ENABLED`: +Otherwise, we return from this function. In the next step we call the `clear_local_APIC` function from the same source code file that shuts down the local `APIC` (more on it in the `Advanced Programmable Interrupt Controller` chapter) and enable `APIC` of the first processor by the setting `unsigned int value` to the `APIC_SPIV_APIC_ENABLED`: ```C value = apic_read(APIC_SPIV); @@ -200,11 +200,11 @@ and writing it with the help of the `apic_write` function: apic_write(APIC_SPIV, value); ``` -After we have enabled `APIC` for the bootstrap processor, we return to the `init_ISA_irqs` function and in the next step we initialize legacy `Programmable Interrupt Controller` and set the legacy chip and handler for the each legacy irq: +After we have enabled `APIC` for the bootstrap processor, we return to the `init_ISA_irqs` function and in the next step we initialize legacy `Programmable Interrupt Controller` and set the legacy chip and handler for each legacy irq: ```C legacy_pic->init(0); - + for (i = 0; i < nr_legacy_irqs(); i++) irq_set_chip_and_handler(i, chip, handle_level_irq); ``` @@ -229,7 +229,7 @@ struct legacy_pic default_legacy_pic = { } ``` -The `init_8259A` function defined in the same source code file and executes initialization of the [Intel 8259](https://en.wikipedia.org/wiki/Intel_8259) ``Programmable Interrupt Controller` (more about it will be in the separate chapter about `Programmable Interrupt Controllers` and `APIC`). +The `init_8259A` function defined in the same source code file and executes initialization of the [Intel 8259](https://en.wikipedia.org/wiki/Intel_8259) `Programmable Interrupt Controller` (more about it will be in the separate chapter about `Programmable Interrupt Controllers` and `APIC`). Now we can return to the `native_init_IRQ` function, after the `init_ISA_irqs` function finished its work. The next step is the call of the `apic_intr_init` function that allocates special interrupt gates which are used by the [SMP](https://en.wikipedia.org/wiki/Symmetric_multiprocessing) architecture for the [Inter-processor interrupt](https://en.wikipedia.org/wiki/Inter-processor_interrupt). The `alloc_intr_gate` macro from the [arch/x86/include/asm/desc.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/desc.h) used for the interrupt descriptor allocation: @@ -253,7 +253,7 @@ if (!test_bit(vector, used_vectors)) { } ``` -We already saw the `set_bit` macro, now let's look on the `test_bit` and the `first_system_vector`. The first `test_bit` macro defined in the [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/bitops.h) and looks like this: +We already saw the `set_bit` macro, now let's look at the `test_bit` and the `first_system_vector`. The first `test_bit` macro defined in the [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/bitops.h) and looks like this: ```C #define test_bit(nr, addr) \ @@ -262,7 +262,7 @@ We already saw the `set_bit` macro, now let's look on the `test_bit` and the `fi : variable_test_bit((nr), (addr))) ``` -We can see the [ternary operator](https://en.wikipedia.org/wiki/Ternary_operation) here make a test with the [gcc](https://en.wikipedia.org/wiki/GNU_Compiler_Collection) built-in function `__builtin_constant_p` tests that given vector number (`nr`) is known at compile time. If you're feeling misunderstanding of the `__builtin_constant_p`, we can make simple test: +We can see the [ternary operator](https://en.wikipedia.org/wiki/Ternary_operation) here makes a test with the [gcc](https://en.wikipedia.org/wiki/GNU_Compiler_Collection) built-in function `__builtin_constant_p` tests that given vector number (`nr`) is known at compile time. If you're feeling misunderstanding of the `__builtin_constant_p`, we can make simple test: ```C #include @@ -279,7 +279,7 @@ int main() { } ``` -and look on the result: +and look at the result: ``` $ gcc test.c -o test @@ -289,7 +289,7 @@ __builtin_constant_p(PREDEFINED_VAL) is 1 __builtin_constant_p(100) is 1 ``` -Now I think it must be clear for you. Let's get back to the `test_bit` macro. If the `__builtin_constant_p` will return non-zero, we call `constant_test_bit` function: +Now I think it must be clear for you. Let's get back to the `test_bit` macro. If the `__builtin_constant_p` returns non-zero, we call `constant_test_bit` function: ```C static inline int constant_test_bit(int nr, const void *addr) @@ -313,7 +313,7 @@ static inline int variable_test_bit(int nr, const void *addr) } ``` -What's the difference between two these functions and why do we need in two different functions for the same purpose? As you already can guess main purpose is optimization. If we will write simple example with these functions: +What's the difference between two these functions and why do we need in two different functions for the same purpose? As you already can guess main purpose is optimization. If we write simple example with these functions: ```C #define CONST 25 @@ -326,7 +326,7 @@ int main() { } ``` -and will look on the assembly output of our example we will see following assembly code: +and will look at the assembly output of our example we will see following assembly code: ```assembly pushq %rbp @@ -351,10 +351,10 @@ movl %eax, %edi call variable_test_bit ``` -for the `variable_test_bit`. These two code listings starts with the same part, first of all we save base of the current stack frame in the `%rbp` register. But after this code for both examples is different. In the first example we put `$268435456` (here the `$268435456` is our second parameter - `0x10000000`) to the `esi` and `$25` (our first parameter) to the `edi` register and call `constant_test_bit`. We put function parameters to the `esi` and `edi` registers because as we are learning Linux kernel for the `x86_64` architecture we use `System V AMD64 ABI` [calling convention](https://en.wikipedia.org/wiki/X86_calling_conventions). All is pretty simple. When we are using predefined constant, the compiler can just substitute its value. Now let's look on the second part. As you can see here, the compiler can not substitute value from the `nr` variable. In this case compiler must calculate its offset on the program's [stack frame](https://en.wikipedia.org/wiki/Call_stack). We subtract `16` from the `rsp` register to allocate stack for the local variables data and put the `$24` (value of the `nr` variable) to the `rbp` with offset `-4`. Our stack frame will be like this: +for the `variable_test_bit`. These two code listings starts with the same part, first of all we save base of the current stack frame in the `%rbp` register. But after this code for both examples is different. In the first example we put `$268435456` (here the `$268435456` is our second parameter - `0x10000000`) to the `esi` and `$25` (our first parameter) to the `edi` register and call `constant_test_bit`. We put function parameters to the `esi` and `edi` registers because as we are learning Linux kernel for the `x86_64` architecture we use `System V AMD64 ABI` [calling convention](https://en.wikipedia.org/wiki/X86_calling_conventions). All is pretty simple. When we are using predefined constant, the compiler can just substitute its value. Now let's look at the second part. As you can see here, the compiler can not substitute value from the `nr` variable. In this case compiler must calculate its offset on the program's [stack frame](https://en.wikipedia.org/wiki/Call_stack). We subtract `16` from the `rsp` register to allocate stack for the local variables data and put the `$24` (value of the `nr` variable) to the `rbp` with offset `-4`. Our stack frame will be like this: ``` - <- stack grows + <- stack grows %[rbp] | @@ -367,7 +367,7 @@ for the `variable_test_bit`. These two code listings starts with the same part, %[rsp] ``` -After this we put this value to the `eax`, so `eax` register now contains value of the `nr`. In the end we do the same that in the first example, we put the `$268435456` (the first parameter of the `variable_test_bit` function) and the value of the `eax` (value of `nr`) to the `edi` register (the second parameter of the `variable_test_bit function`). +After this we put this value to the `eax`, so `eax` register now contains value of the `nr`. In the end we do the same that in the first example, we put the `$268435456` (the first parameter of the `variable_test_bit` function) and the value of the `eax` (value of `nr`) to the `edi` register (the second parameter of the `variable_test_bit function`). The next step after the `apic_intr_init` function will finish its work is the setting interrupt gates from the `FIRST_EXTERNAL_VECTOR` or `0x20` up to `0x100`: @@ -408,7 +408,7 @@ if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) setup_irq(2, &irq2); ``` -First of all let's deal with the condition. The `acpi_ioapic` variable represents existence of [I/O APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller#I.2FO_APICs). It defined in the [arch/x86/kernel/acpi/boot.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/acpi/boot.c). This variable set in the `acpi_set_irq_model_ioapic` function that called during the processing `Multiple APIC Description Table`. This occurs during initialization of the architecture-specific stuff in the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/setup.c) (more about it we will know in the other chapter about [APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller)). Note that the value of the `acpi_ioapic` variable depends on the `CONFIG_ACPI` and `CONFIG_X86_LOCAL_APIC` Linux kernel configuration options. If these options did not set, this variable will be just zero: +First of all let's deal with the condition. The `acpi_ioapic` variable represents existence of [I/O APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller#I.2FO_APICs). It defined in the [arch/x86/kernel/acpi/boot.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/acpi/boot.c). This variable set in the `acpi_set_irq_model_ioapic` function that called during the processing `Multiple APIC Description Table`. This occurs during initialization of the architecture-specific stuff in the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/setup.c) (more about it we will know in the other chapter about [APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller)). Note that the value of the `acpi_ioapic` variable depends on the `CONFIG_ACPI` and `CONFIG_X86_LOCAL_APIC` Linux kernel configuration options. If these options were not set, this variable will be just zero: ```C #define acpi_ioapic 0 @@ -430,7 +430,7 @@ extern int of_ioapic; #endif ``` -If the condition will return non-zero value we call the: +If the condition returns non-zero value we call the: ```C setup_irq(2, &irq2); @@ -465,7 +465,7 @@ Some time ago interrupt controller consisted of two chips and one was connected * `IRQ 6` - drive controller; * `IRQ 7` - `LPT1`. -The `setup_irq` function defined in the [kernel/irq/manage.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/irq/manage.c) and takes two parameters: +The `setup_irq` function is defined in the [kernel/irq/manage.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/irq/manage.c) and takes two parameters: * vector number of an interrupt; * `irqaction` structure related with an interrupt. @@ -476,7 +476,7 @@ This function initializes interrupt descriptor from the given vector number at t struct irq_desc *desc = irq_to_desc(irq); ``` -And call the `__setup_irq` function that setups given interrupt: +And call the `__setup_irq` function that sets up given interrupt: ```C chip_bus_lock(desc); @@ -485,7 +485,7 @@ chip_bus_sync_unlock(desc); return retval; ``` -Note that the interrupt descriptor is locked during `__setup_irq` function will work. The `__setup_irq` function makes many different things: It creates a handler thread when a thread function is supplied and the interrupt does not nest into another interrupt thread, sets the flags of the chip, fills the `irqaction` structure and many many more. +Note that the interrupt descriptor is locked during `__setup_irq` function will work. The `__setup_irq` function does many different things: it creates a handler thread when a thread function is supplied and the interrupt does not nest into another interrupt thread, sets the flags of the chip, fills the `irqaction` structure and many many more. All of the above it creates `/prov/vector_number` directory and fills it, but if you are using modern computer all values will be zero there: @@ -493,16 +493,16 @@ All of the above it creates `/prov/vector_number` directory and fills it, but if $ cat /proc/irq/2/node 0 -$cat /proc/irq/2/affinity_hint +$cat /proc/irq/2/affinity_hint 00 -cat /proc/irq/2/spurious +cat /proc/irq/2/spurious count 0 unhandled 0 last_unhandled 0 ms ``` -because probably `APIC` handles interrupts on the our machine. +because probably `APIC` handles interrupts on the machine. That's all. diff --git a/Interrupts/linux-interrupts-9.md b/Interrupts/linux-interrupts-9.md index c2881c02..d2350f66 100644 --- a/Interrupts/linux-interrupts-9.md +++ b/Interrupts/linux-interrupts-9.md @@ -16,7 +16,7 @@ As you can understand, it is almost impossible to make so that both characterist * Top half; * Bottom half; -In the past there was one way to defer interrupt handling in Linux kernel. And it was called: `the bottom half` of the processor, but now it is already not actual. Now this term has remained as a common noun referring to all the different ways of organizing deferred processing of an interrupt.The deferred processing of an interrupt suggests that some of the actions for an interrupt may be postponed to a later execution when the system will be less loaded. As you can suggest, an interrupt handler can do large amount of work that is impermissible as it executes in the context where interrupts are disabled. That's why processing of an interrupt can be split on two different parts. In the first part, the main handler of an interrupt does only minimal and the most important job. After this it schedules the second part and finishes its work. When the system is less busy and context of the processor allows to handle interrupts, the second part starts its work and finishes to process remaining part of a deferred interrupt. +In the past there was one way to defer interrupt handling in Linux kernel. And it was called: `the bottom half` of the processor, but now it is already not actual. Now this term has remained as a common noun referring to all the different ways of organizing deferred processing of an interrupt.The deferred processing of an interrupt suggests that some of the actions for an interrupt may be postponed to a later execution when the system will be less loaded. As you can suggest, an interrupt handler can do large amount of work that is impermissible as it executes in the context where interrupts are disabled. That's why processing of an interrupt can be split in two different parts. In the first part, the main handler of an interrupt does only minimal and the most important job. After this it schedules the second part and finishes its work. When the system is less busy and context of the processor allows to handle interrupts, the second part starts its work and finishes to process remaining part of a deferred interrupt. There are three types of `deferred interrupts` in the Linux kernel: @@ -24,7 +24,7 @@ There are three types of `deferred interrupts` in the Linux kernel: * `tasklets`; * `workqueues`; -And we will see description of all of these types in this part. As I said, we saw only a little bit about this theme, so, now is time to dive deep into details about this theme. +And we will see a description of all of these types in this part. As I said, we saw only a little bit about this theme, so, now is time to dive deep into details about this theme. Softirqs ---------------------------------------------------------------------------------- @@ -43,7 +43,7 @@ $ systemd-cgls -k | grep ksoft ├─ 43 [ksoftirqd/7] ``` -The `spawn_ksoftirqd` function starts this these threads. As we can see this function called as early [initcall](https://kernelnewbies.org/Documents/InitcallMechanism): +The `spawn_ksoftirqd` function starts these threads. As we can see this function called as early [initcall](https://kernelnewbies.org/Documents/InitcallMechanism): ```C early_initcall(spawn_ksoftirqd); @@ -101,8 +101,8 @@ const char * const softirq_to_name[NR_SOFTIRQS] = { Or we can see it in the output of the `/proc/softirqs`: ``` -~$ cat /proc/softirqs - CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7 +~$ cat /proc/softirqs + CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7 HI: 5 0 0 0 0 0 0 0 TIMER: 332519 310498 289555 272913 282535 279467 282895 270979 NET_TX: 2320 0 0 2 1 1 0 0 @@ -139,7 +139,7 @@ void raise_softirq(unsigned int nr) Here we can see the call of the `raise_softirq_irqoff` function between the `local_irq_save` and the `local_irq_restore` macros. The `local_irq_save` defined in the [include/linux/irqflags.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/irqflags.h) header file and saves the state of the [IF](https://en.wikipedia.org/wiki/Interrupt_flag) flag of the [eflags](https://en.wikipedia.org/wiki/FLAGS_register) register and disables interrupts on the local processor. The `local_irq_restore` macro defined in the same header file and does the opposite thing: restores the `interrupt flag` and enables interrupts. We disable interrupts here because a `softirq` interrupt runs in the interrupt context and that one softirq (and no others) will be run. -The `raise_softirq_irqoff` function marks the softirq as deffered by setting the bit corresponding to the given index `nr` in the `softirq` bit mask (`__softirq_pending`) of the local processor. It does it with the help of the: +The `raise_softirq_irqoff` function marks the softirq as deferred by setting the bit corresponding to the given index `nr` in the `softirq` bit mask (`__softirq_pending`) of the local processor. It does it with the help of the: ```C __raise_softirq_irqoff(nr); @@ -186,7 +186,7 @@ if (pending) { --max_restart) goto restart; } -... +... ``` Checks of the existence of the deferred interrupts are performed periodically. There are several points where these checks occur. The main point is the call of the `do_IRQ` function defined in [arch/x86/kernel/irq.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/irq.c), which provides the main means for actual interrupt processing in the Linux kernel. When `do_IRQ` finishes handling an interrupt, it calls the `exiting_irq` function from the [arch/x86/include/asm/apic.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/apic.h) that expands to the call of the `irq_exit` function. `irq_exit` checks for deferred interrupts and the current context and calls the `invoke_softirq` function: @@ -283,8 +283,9 @@ As we can see this structure contains five fields, they are: * Main callback of the tasklet; * Parameter of the callback. -In our case, we set only for initialize only two arrays of tasklets in the `softirq_init` function: the `tasklet_vec` and the `tasklet_hi_vec`. Tasklets and high-priority tasklets are stored in the `tasklet_vec` and `tasklet_hi_vec` arrays, respectively. So, we have initialized these arrays and now we can see two calls of the `open_softirq` function that is defined in the [kernel/softirq.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/softirq.c) source code file: - +In our case, we initialize only two per-CPU tasklet vectors: `tasklet_vec` for normal-priority tasklets and `tasklet_hi_vec` for high-priority tasklets. These vectors are implemented as linked lists, with each CPU maintaining its own instance. +After setting up the tasklet vectors, we register two softirq handlers using the `open_softirq` function that is defined in the [kernel/softirq.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/softirq.c) source code file: + ```C open_softirq(TASKLET_SOFTIRQ, tasklet_action); open_softirq(HI_SOFTIRQ, tasklet_hi_action); @@ -292,7 +293,7 @@ open_softirq(HI_SOFTIRQ, tasklet_hi_action); at the end of the `softirq_init` function. The main purpose of the `open_softirq` function is the initialization of `softirq`. Let's look on the implementation of the `open_softirq` function. -, in our case they are: `tasklet_action` and the `tasklet_hi_action` or the `softirq` function associated with the `HI_SOFTIRQ` softirq is named `tasklet_hi_action` and `softirq` function associated with the `TASKLET_SOFTIRQ` is named `tasklet_action`. The Linux kernel provides API for the manipulating of `tasklets`. First of all it is the `tasklet_init` function that takes `tasklet_struct`, function and parameter for it and initializes the given `tasklet_struct` with the given data: +In our case they are: `tasklet_action` and the `tasklet_hi_action` or the `softirq` function associated with the `HI_SOFTIRQ` softirq is named `tasklet_hi_action` and `softirq` function associated with the `TASKLET_SOFTIRQ` is named `tasklet_action`. The Linux kernel provides API for the manipulating of `tasklets`. First of all it is the `tasklet_init` function that takes `tasklet_struct`, function and parameter for it and initializes the given `tasklet_struct` with the given data: ```C void tasklet_init(struct tasklet_struct *t, @@ -310,7 +311,7 @@ There are additional methods to initialize a tasklet statically with the two fol ```C DECLARE_TASKLET(name, func, data); -DECLARE_TASKLET_DISABLED(name, func, data); +DECLARE_TASKLET_DISABLED(name, func, data); ``` The Linux kernel provides three following functions to mark a tasklet as ready to run: @@ -368,7 +369,7 @@ static void tasklet_action(struct softirq_action *a) } ``` -In the beginning of the `tasklet_action` function, we disable interrupts for the local processor with the help of the `local_irq_disable` macro (you can read about this macro in the second [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-2) of this chapter). In the next step, we take a head of the list that contains tasklets with normal priority and set this per-cpu list to `NULL` because all tasklets must be executed in a generally way. After this we enable interrupts for the local processor and go through the list of tasklets in the loop. In every iteration of the loop we call the `tasklet_trylock` function for the given tasklet that updates state of the given tasklet on `TASKLET_STATE_RUN`: +In the beginning of the `tasklet_action` function, we disable interrupts for the local processor with the help of the `local_irq_disable` macro (you can read about this macro in the second [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-2) of this chapter). In the next step, we take a head of the list that contains tasklets with normal priority and set this per-cpu list to `NULL` because all tasklets must be executed in a general way. After this we enable interrupts for the local processor and go through the list of tasklets in the loop. In every iteration of the loop we call the `tasklet_trylock` function for the given tasklet that updates state of the given tasklet on `TASKLET_STATE_RUN`: ```C static inline int tasklet_trylock(struct tasklet_struct *t) diff --git a/KernelStructures/linux-kernelstructure-1.md b/KernelStructures/linux-kernelstructure-1.md index 53743501..72377f3e 100644 --- a/KernelStructures/linux-kernelstructure-1.md +++ b/KernelStructures/linux-kernelstructure-1.md @@ -13,9 +13,9 @@ Types of Exceptions: * Traps - are precise exceptions reported on the boundary `following` the instruction causing the exception. The same with `%rip`; * Aborts - are imprecise exceptions. Because they are imprecise, aborts typically do not allow reliable program restart. -`Maskable` interrupts trigger the interrupt-handling mechanism only when RFLAGS.IF=1. Otherwise they are held pending for as long as the RFLAGS.IF bit is cleared to 0. +`Maskable` interrupts trigger the interrupt-handling mechanism only when `RFLAGS.IF=1`. Otherwise they are held pending for as long as the `RFLAGS.IF` bit is cleared to 0. -`Nonmaskable` interrupts (NMI) are unaffected by the value of the rFLAGS.IF bit. However, the occurrence of an NMI masks further NMIs until an IRET instruction is executed. +`Nonmaskable` interrupts (NMI) are unaffected by the value of the 'RFLAGS.IF' bit. However, the occurrence of an NMI masks further NMIs until an IRET instruction is executed. Specific exception and interrupt sources are assigned a fixed vector-identification number (also called an “interrupt vector” or simply “vector”). The interrupt vector is used by the interrupt-handling mechanism to locate the system-software service routine assigned to the exception or interrupt. Up to 256 unique interrupt vectors are available. The first 32 vectors are reserved for predefined exception and interrupt conditions. They are defined in the [arch/x86/include/asm/traps.h](http://lxr.free-electrons.com/source/arch/x86/include/asm/traps.h#L121) header file: diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..d60efe46 --- /dev/null +++ b/LICENSE @@ -0,0 +1,437 @@ +Attribution-NonCommercial-ShareAlike 4.0 International + +======================================================================= + +Creative Commons Corporation ("Creative Commons") is not a law firm and +does not provide legal services or legal advice. Distribution of +Creative Commons public licenses does not create a lawyer-client or +other relationship. Creative Commons makes its licenses and related +information available on an "as-is" basis. Creative Commons gives no +warranties regarding its licenses, any material licensed under their +terms and conditions, or any related information. Creative Commons +disclaims all liability for damages resulting from their use to the +fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and +conditions that creators and other rights holders may use to share +original works of authorship and other material subject to copyright +and certain other rights specified in the public license below. The +following considerations are for informational purposes only, are not +exhaustive, and do not form part of our licenses. + + Considerations for licensors: Our public licenses are + intended for use by those authorized to give the public + permission to use material in ways otherwise restricted by + copyright and certain other rights. Our licenses are + irrevocable. Licensors should read and understand the terms + and conditions of the license they choose before applying it. + Licensors should also secure all rights necessary before + applying our licenses so that the public can reuse the + material as expected. Licensors should clearly mark any + material not subject to the license. This includes other CC- + licensed material, or material used under an exception or + limitation to copyright. More considerations for licensors: + wiki.creativecommons.org/Considerations_for_licensors + + Considerations for the public: By using one of our public + licenses, a licensor grants the public permission to use the + licensed material under specified terms and conditions. If + the licensor's permission is not necessary for any reason--for + example, because of any applicable exception or limitation to + copyright--then that use is not regulated by the license. Our + licenses grant only permissions under copyright and certain + other rights that a licensor has authority to grant. Use of + the licensed material may still be restricted for other + reasons, including because others have copyright or other + rights in the material. A licensor may make special requests, + such as asking that all changes be marked or described. + Although not required by our licenses, you are encouraged to + respect those requests where reasonable. More_considerations + for the public: + wiki.creativecommons.org/Considerations_for_licensees + +======================================================================= + +Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International +Public License + +By exercising the Licensed Rights (defined below), You accept and agree +to be bound by the terms and conditions of this Creative Commons +Attribution-NonCommercial-ShareAlike 4.0 International Public License +("Public License"). To the extent this Public License may be +interpreted as a contract, You are granted the Licensed Rights in +consideration of Your acceptance of these terms and conditions, and the +Licensor grants You such rights in consideration of benefits the +Licensor receives from making the Licensed Material available under +these terms and conditions. + + +Section 1 -- Definitions. + + a. Adapted Material means material subject to Copyright and Similar + Rights that is derived from or based upon the Licensed Material + and in which the Licensed Material is translated, altered, + arranged, transformed, or otherwise modified in a manner requiring + permission under the Copyright and Similar Rights held by the + Licensor. For purposes of this Public License, where the Licensed + Material is a musical work, performance, or sound recording, + Adapted Material is always produced where the Licensed Material is + synched in timed relation with a moving image. + + b. Adapter's License means the license You apply to Your Copyright + and Similar Rights in Your contributions to Adapted Material in + accordance with the terms and conditions of this Public License. + + c. BY-NC-SA Compatible License means a license listed at + creativecommons.org/compatiblelicenses, approved by Creative + Commons as essentially the equivalent of this Public License. + + d. Copyright and Similar Rights means copyright and/or similar rights + closely related to copyright including, without limitation, + performance, broadcast, sound recording, and Sui Generis Database + Rights, without regard to how the rights are labeled or + categorized. For purposes of this Public License, the rights + specified in Section 2(b)(1)-(2) are not Copyright and Similar + Rights. + + e. Effective Technological Measures means those measures that, in the + absence of proper authority, may not be circumvented under laws + fulfilling obligations under Article 11 of the WIPO Copyright + Treaty adopted on December 20, 1996, and/or similar international + agreements. + + f. Exceptions and Limitations means fair use, fair dealing, and/or + any other exception or limitation to Copyright and Similar Rights + that applies to Your use of the Licensed Material. + + g. License Elements means the license attributes listed in the name + of a Creative Commons Public License. The License Elements of this + Public License are Attribution, NonCommercial, and ShareAlike. + + h. Licensed Material means the artistic or literary work, database, + or other material to which the Licensor applied this Public + License. + + i. Licensed Rights means the rights granted to You subject to the + terms and conditions of this Public License, which are limited to + all Copyright and Similar Rights that apply to Your use of the + Licensed Material and that the Licensor has authority to license. + + j. Licensor means the individual(s) or entity(ies) granting rights + under this Public License. + + k. NonCommercial means not primarily intended for or directed towards + commercial advantage or monetary compensation. For purposes of + this Public License, the exchange of the Licensed Material for + other material subject to Copyright and Similar Rights by digital + file-sharing or similar means is NonCommercial provided there is + no payment of monetary compensation in connection with the + exchange. + + l. Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, distribution, + dissemination, communication, or importation, and to make material + available to the public including in ways that members of the + public may access the material from a place and at a time + individually chosen by them. + + m. Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and of + the Council of 11 March 1996 on the legal protection of databases, + as amended and/or succeeded, as well as other essentially + equivalent rights anywhere in the world. + + n. You means the individual or entity exercising the Licensed Rights + under this Public License. Your has a corresponding meaning. + + +Section 2 -- Scope. + + a. License grant. + + 1. Subject to the terms and conditions of this Public License, + the Licensor hereby grants You a worldwide, royalty-free, + non-sublicensable, non-exclusive, irrevocable license to + exercise the Licensed Rights in the Licensed Material to: + + a. reproduce and Share the Licensed Material, in whole or + in part, for NonCommercial purposes only; and + + b. produce, reproduce, and Share Adapted Material for + NonCommercial purposes only. + + 2. Exceptions and Limitations. For the avoidance of doubt, where + Exceptions and Limitations apply to Your use, this Public + License does not apply, and You do not need to comply with + its terms and conditions. + + 3. Term. The term of this Public License is specified in Section + 6(a). + + 4. Media and formats; technical modifications allowed. The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter created, + and to make technical modifications necessary to do so. The + Licensor waives and/or agrees not to assert any right or + authority to forbid You from making technical modifications + necessary to exercise the Licensed Rights, including + technical modifications necessary to circumvent Effective + Technological Measures. For purposes of this Public License, + simply making modifications authorized by this Section 2(a) + (4) never produces Adapted Material. + + 5. Downstream recipients. + + a. Offer from the Licensor -- Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + + b. Additional offer from the Licensor -- Adapted Material. + Every recipient of Adapted Material from You + automatically receives an offer from the Licensor to + exercise the Licensed Rights in the Adapted Material + under the conditions of the Adapter's License You apply. + + c. No downstream restrictions. You may not offer or impose + any additional or different terms or conditions on, or + apply any Effective Technological Measures to, the + Licensed Material if doing so restricts exercise of the + Licensed Rights by any recipient of the Licensed + Material. + + 6. No endorsement. Nothing in this Public License constitutes or + may be construed as permission to assert or imply that You + are, or that Your use of the Licensed Material is, connected + with, or sponsored, endorsed, or granted official status by, + the Licensor or others designated to receive attribution as + provided in Section 3(a)(1)(A)(i). + + b. Other rights. + + 1. Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, to + the extent possible, the Licensor waives and/or agrees not to + assert any such rights held by the Licensor to the limited + extent necessary to allow You to exercise the Licensed + Rights, but not otherwise. + + 2. Patent and trademark rights are not licensed under this + Public License. + + 3. To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. In all other cases the Licensor expressly + reserves any right to collect such royalties, including when + the Licensed Material is used other than for NonCommercial + purposes. + + +Section 3 -- License Conditions. + +Your exercise of the Licensed Rights is expressly made subject to the +following conditions. + + a. Attribution. + + 1. If You Share the Licensed Material (including in modified + form), You must: + + a. retain the following if it is supplied by the Licensor + with the Licensed Material: + + i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if + designated); + + ii. a copyright notice; + + iii. a notice that refers to this Public License; + + iv. a notice that refers to the disclaimer of + warranties; + + v. a URI or hyperlink to the Licensed Material to the + extent reasonably practicable; + + b. indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + + c. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + + 2. You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may be + reasonable to satisfy the conditions by providing a URI or + hyperlink to a resource that includes the required + information. + 3. If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + + b. ShareAlike. + + In addition to the conditions in Section 3(a), if You Share + Adapted Material You produce, the following conditions also apply. + + 1. The Adapter's License You apply must be a Creative Commons + license with the same License Elements, this version or + later, or a BY-NC-SA Compatible License. + + 2. You must include the text of, or the URI or hyperlink to, the + Adapter's License You apply. You may satisfy this condition + in any reasonable manner based on the medium, means, and + context in which You Share Adapted Material. + + 3. You may not offer or impose any additional or different terms + or conditions on, or apply any Effective Technological + Measures to, Adapted Material that restrict exercise of the + rights granted under the Adapter's License You apply. + + +Section 4 -- Sui Generis Database Rights. + +Where the Licensed Rights include Sui Generis Database Rights that +apply to Your use of the Licensed Material: + + a. for the avoidance of doubt, Section 2(a)(1) grants You the right + to extract, reuse, reproduce, and Share all or a substantial + portion of the contents of the database for NonCommercial purposes + only; + + b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material, + including for purposes of Section 3(b); and + + c. You must comply with the conditions in Section 3(a) if You Share + all or a substantial portion of the contents of the database. + +For the avoidance of doubt, this Section 4 supplements and does not +replace Your obligations under this Public License where the Licensed +Rights include other Copyright and Similar Rights. + + +Section 5 -- Disclaimer of Warranties and Limitation of Liability. + + a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE + EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS + AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF + ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, + IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. + + b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE + TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, + NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, + INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, + COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR + USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR + DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR + IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. + + c. The disclaimer of warranties and limitation of liability provided + above shall be interpreted in a manner that, to the extent + possible, most closely approximates an absolute disclaimer and + waiver of all liability. + + +Section 6 -- Term and Termination. + + a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. However, if You fail to comply with + this Public License, then Your rights under this Public License + terminate automatically. + + b. Where Your right to use the Licensed Material has terminated under + Section 6(a), it reinstates: + + 1. automatically as of the date the violation is cured, provided + it is cured within 30 days of Your discovery of the + violation; or + + 2. upon express reinstatement by the Licensor. + + For the avoidance of doubt, this Section 6(b) does not affect any + right the Licensor may have to seek remedies for Your violations + of this Public License. + + c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing so + will not terminate this Public License. + + d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + + +Section 7 -- Other Terms and Conditions. + + a. The Licensor shall not be bound by any additional or different + terms or conditions communicated by You unless expressly agreed. + + b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + + +Section 8 -- Interpretation. + + a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could lawfully + be made without permission under this Public License. + + b. To the extent possible, if any provision of this Public License is + deemed unenforceable, it shall be automatically reformed to the + minimum extent necessary to make it enforceable. If the provision + cannot be reformed, it shall be severed from this Public License + without affecting the enforceability of the remaining terms and + conditions. + + c. No term or condition of this Public License will be waived and no + failure to comply consented to unless expressly agreed to by the + Licensor. + + d. Nothing in this Public License constitutes or may be interpreted + as a limitation upon, or waiver of, any privileges and immunities + that apply to the Licensor or You, including from the legal + processes of any jurisdiction or authority. + +======================================================================= + +Creative Commons is not a party to its public +licenses. Notwithstanding, Creative Commons may elect to apply one of +its public licenses to material it publishes and in those instances +will be considered the “Licensor.” The text of the Creative Commons +public licenses is dedicated to the public domain under the CC0 Public +Domain Dedication. Except for the limited purpose of indicating that +material is shared under a Creative Commons public license or as +otherwise permitted by the Creative Commons policies published at +creativecommons.org/policies, Creative Commons does not authorize the +use of the trademark "Creative Commons" or any other trademark or logo +of Creative Commons without its prior written consent including, +without limitation, in connection with any unauthorized modifications +to any of its public licenses or any other arrangements, +understandings, or agreements concerning use of licensed material. For +the avoidance of doubt, this paragraph does not form part of the +public licenses. + +Creative Commons may be contacted at creativecommons.org. diff --git a/MM/README.md b/MM/README.md index fbf3d7af..d5f5dada 100644 --- a/MM/README.md +++ b/MM/README.md @@ -1,7 +1,7 @@ # Linux kernel memory management -This chapter describes memory management in the linux kernel. You will see here a -couple of posts which describe different parts of the linux memory management framework: +This chapter describes memory management in the Linux kernel. You will see here a +couple of posts which describe different parts of the Linux memory management framework: * [Memblock](linux-mm-1.md) - describes early `memblock` allocator. * [Fix-Mapped Addresses and ioremap](linux-mm-2.md) - describes `fix-mapped` addresses and early `ioremap`. diff --git a/MM/images/kernel_configuration_menu1.png b/MM/images/kernel_configuration_menu1.png index 54ef0ac5..bcf59f29 100644 Binary files a/MM/images/kernel_configuration_menu1.png and b/MM/images/kernel_configuration_menu1.png differ diff --git a/MM/images/kernel_configuration_menu2.png b/MM/images/kernel_configuration_menu2.png index 6981c94c..aa8d8912 100644 Binary files a/MM/images/kernel_configuration_menu2.png and b/MM/images/kernel_configuration_menu2.png differ diff --git a/MM/images/memblock.png b/MM/images/memblock.png index 6e6279bc..6bfe5969 100644 Binary files a/MM/images/memblock.png and b/MM/images/memblock.png differ diff --git a/MM/linux-mm-1.md b/MM/linux-mm-1.md index b34ba96e..79db4440 100644 --- a/MM/linux-mm-1.md +++ b/MM/linux-mm-1.md @@ -4,7 +4,7 @@ Linux kernel memory management Part 1. Introduction -------------------------------------------------------------------------------- -Memory management is one of the most complex (and I think that it is the most complex) part of the operating system kernel. In the [last preparations before the kernel entry point](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-3) part we stopped right before call of the `start_kernel` function. This function initializes all the kernel features (including architecture-dependent features) before the kernel runs the first `init` process. You may remember as we built early page tables, identity page tables and fixmap page tables in the boot time. No complicated memory management is working yet. When the `start_kernel` function is called we will see the transition to more complex data structures and techniques for memory management. For a good understanding of the initialization process in the linux kernel we need to have a clear understanding of these techniques. This chapter will provide an overview of the different parts of the linux kernel memory management framework and its API, starting from the `memblock`. +Memory management is one of the most complex (and I think that it is the most complex) part of the operating system kernel. In the [last preparations before the kernel entry point](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-3) part we stopped right before call of the `start_kernel` function. This function initializes all the kernel features (including architecture-dependent features) before the kernel runs the first `init` process. You may remember as we built early page tables, identity page tables and fixmap page tables in the boot time. No complicated memory management is working yet. When the `start_kernel` function is called we will see the transition to more complex data structures and techniques for memory management. For a good understanding of the initialization process in the Linux kernel we need to have a clear understanding of these techniques. This chapter will provide an overview of the different parts of the linux kernel memory management framework and its API, starting from the `memblock`. Memblock -------------------------------------------------------------------------------- @@ -155,7 +155,7 @@ On this step the initialization of the `memblock` structure has been finished an Memblock API -------------------------------------------------------------------------------- -Ok we have finished with the initialization of the `memblock` structure and now we can look at the Memblock API and its implementation. As I said above, the implementation of `memblock` is taking place fully in [mm/memblock.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/mm/memblock.c). To understand how `memblock` works and how it is implemented, let's look at its usage first. There are a couple of [places](http://lxr.free-electrons.com/ident?i=memblock) in the linux kernel where memblock is used. For example let's take `memblock_x86_fill` function from the [arch/x86/kernel/e820.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/e820.c#L1061). This function goes through the memory map provided by the [e820](http://en.wikipedia.org/wiki/E820) and adds memory regions reserved by the kernel to the `memblock` with the `memblock_add` function. Since we have met the `memblock_add` function first, let's start from it. +Ok we have finished with the initialization of the `memblock` structure and now we can look at the Memblock API and its implementation. As I said above, the implementation of `memblock` is taking place fully in [mm/memblock.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/mm/memblock.c). To understand how `memblock` works and how it is implemented, let's look at its usage first. There are a couple of [places](http://lxr.free-electrons.com/ident?i=memblock) in the Linux kernel where memblock is used. For example let's take `memblock_x86_fill` function from the [arch/x86/kernel/e820.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/e820.c#L1061). This function goes through the memory map provided by the [e820](http://en.wikipedia.org/wiki/E820) and adds memory regions reserved by the kernel to the `memblock` with the `memblock_add` function. Since we have met the `memblock_add` function first, let's start from it. This function takes a physical base address and the size of the memory region as arguments and add them to the `memblock`. The `memblock_add` function does not do anything special in its body, but just calls the: @@ -163,7 +163,7 @@ This function takes a physical base address and the size of the memory region as memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0); ``` -function. We pass the memory block type - `memory`, the physical base address and the size of the memory region, the maximum number of nodes which is 1 if `CONFIG_NODES_SHIFT` is not set in the configuration file or `1 << CONFIG_NODES_SHIFT` if it is set, and the flags. The `memblock_add_range` function adds a new memory region to the memory block. It starts by checking the size of the given region and if it is zero it just returns. After this, `memblock_add_range` checks the existence of the memory regions in the `memblock` structure with the given `memblock_type`. If there are no memory regions, we just fill a new `memory_region` with the given values and return (we already saw the implementation of this in the [First touch of the linux kernel memory manager framework](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-3)). If `memblock_type` is not empty, we start to add a new memory region to the `memblock` with the given `memblock_type`. +function. We pass the memory block type - `memory`, the physical base address and the size of the memory region, the maximum number of nodes which is 1 if `CONFIG_NODES_SHIFT` is not set in the configuration file or `1 << CONFIG_NODES_SHIFT` if it is set, and the flags. The `memblock_add_range` function adds a new memory region to the memory block. It starts by checking the size of the given region and if it is zero it just returns. After this, `memblock_add_range` checks the existence of the memory regions in the `memblock` structure with the given `memblock_type`. If there are no memory regions, we just fill a new `memory_region` with the given values and return (we already saw the implementation of this in the [First touch of the Linux kernel memory manager framework](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-3)). If `memblock_type` is not empty, we start to add a new memory region to the `memblock` with the given `memblock_type`. First of all we get the end of the memory region with the: @@ -171,7 +171,7 @@ First of all we get the end of the memory region with the: phys_addr_t end = base + memblock_cap_size(base, &size); ``` -`memblock_cap_size` adjusts `size` that `base + size` will not overflow. Its implementation is pretty easy: +`memblock_cap_size` adjusts `size` so that `base + size` will not overflow. Its implementation is pretty easy: ```C static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size) @@ -337,10 +337,10 @@ There is also `memblock_reserve` function which does the same as `memblock_add`, Of course this is not the full API. Memblock provides APIs not only for adding `memory` and `reserved` memory regions, but also: -* memblock_remove - removes memory region from memblock; -* memblock_find_in_range - finds free area in given range; -* memblock_free - releases memory region in memblock; -* for_each_mem_range - iterates through memblock areas. +* `memblock_remove` - removes memory region from memblock; +* `memblock_find_in_range` - finds free area in given range; +* `memblock_free` - releases memory region in memblock; +* `for_each_mem_range` - iterates through memblock areas. and many more.... @@ -349,8 +349,8 @@ Getting info about memory regions Memblock also provides an API for getting information about allocated memory regions in the `memblock`. It is split in two parts: -* get_allocated_memblock_memory_regions_info - getting info about memory regions; -* get_allocated_memblock_reserved_regions_info - getting info about reserved regions. +* `get_allocated_memblock_memory_regions_info` - getting info about memory regions; +* `get_allocated_memblock_reserved_regions_info` - getting info about reserved regions. Implementation of these functions is easy. Let's look at `get_allocated_memblock_reserved_regions_info` for example: @@ -401,16 +401,16 @@ And you will see something like this: Memblock also has support in [debugfs](http://en.wikipedia.org/wiki/Debugfs). If you run the kernel on another architecture than `X86` you can access: -* /sys/kernel/debug/memblock/memory -* /sys/kernel/debug/memblock/reserved -* /sys/kernel/debug/memblock/physmem +* `/sys/kernel/debug/memblock/memory` +* `/sys/kernel/debug/memblock/reserved` +* `/sys/kernel/debug/memblock/physmem` to get a dump of the `memblock` contents. Conclusion -------------------------------------------------------------------------------- -This is the end of the first part about linux kernel memory management. If you have questions or suggestions, ping me on twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com) or just create an [issue](https://github.com/0xAX/linux-insides/issues/new). +This is the end of the first part about Linux kernel memory management. If you have questions or suggestions, ping me on twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com) or just create an [issue](https://github.com/0xAX/linux-insides/issues/new). **Please note that English is not my first language and I am really sorry for any inconvenience. If you found any mistakes please send me a PR to [linux-insides](https://github.com/0xAX/linux-insides).** @@ -420,4 +420,4 @@ Links * [e820](http://en.wikipedia.org/wiki/E820) * [numa](http://en.wikipedia.org/wiki/Non-uniform_memory_access) * [debugfs](http://en.wikipedia.org/wiki/Debugfs) -* [First touch of the linux kernel memory manager framework](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-3) +* [First touch of the Linux kernel memory manager framework](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-3) diff --git a/MM/linux-mm-2.md b/MM/linux-mm-2.md index 2b2fca0f..dd54943e 100644 --- a/MM/linux-mm-2.md +++ b/MM/linux-mm-2.md @@ -36,9 +36,9 @@ Base virtual address and size of the `fix-mapped` area are presented by the two #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) ``` -Here `__end_of_permanent_fixed_addresses` is an element of the `fixed_addresses` enum and as I wrote above: Every fix-mapped address is represented by an integer index which is defined in the `fixed_addresses`. `PAGE_SHIFT` determines the size of a page. For example size of the one page we can get with the `1 << PAGE_SHIFT` expression. +Here `__end_of_permanent_fixed_addresses` is an element of the `fixed_addresses` enum and as I wrote above, every fix-mapped address is represented by an integer index which is defined in the `fixed_addresses`. `PAGE_SHIFT` determines the size of a page. For example size of the one page we can get with the `1 << PAGE_SHIFT` expression. -In our case we need to get the size of the fix-mapped area, but not only of one page, that's why we are using `__end_of_permanent_fixed_addresses` for getting the size of the fix-mapped area. The `__end_of_permanent_fixed_addresses` is the last index of the `fixed_addresses` enum or in other words the `__end_of_permanent_fixed_addresses` contains amount of pages in a fixed-mapped area. So if multiply value of the `__end_of_permanent_fixed_addresses` on a page size value we will get size of fix-mapped area. In my case it's a little more than `536` kilobytes. In your case it might be a different number, because the size depends on amount of the fix-mapped addresses which are depends on your kernel's configuration. +In our case we need to get the size of the fix-mapped area, but not only of one page, that's why we are using `__end_of_permanent_fixed_addresses` for getting the size of the fix-mapped area. The `__end_of_permanent_fixed_addresses` is the last index of the `fixed_addresses` enum or in other words the `__end_of_permanent_fixed_addresses` contains amount of pages in a fixed-mapped area. So if we multiply the value of the `__end_of_permanent_fixed_addresses` on a page size value we will get size of fix-mapped area. In my case it's a little more than `536` kilobytes. In your case it might be a different number, because the size depends on amount of the fix-mapped addresses which depends on your kernel configuration. The second `FIXADDR_START` macro just subtracts the fix-mapped area size from the last address of the fix-mapped area to get its base virtual address. `FIXADDR_TOP` is a rounded up address from the base address of the [vsyscall](https://lwn.net/Articles/446528/) space: @@ -46,8 +46,8 @@ The second `FIXADDR_START` macro just subtracts the fix-mapped area size from th #define FIXADDR_TOP (round_up(VSYSCALL_ADDR + PAGE_SIZE, 1< Memory Debugging ``` - + menu of the Linux kernel configuration: ![kernel configuration menu](images/kernel_configuration_menu1.png) @@ -140,7 +140,7 @@ config X86 ... ``` -So, there is no anything which is specific for other architectures. +So, there isn't anything which is specific for other architectures. Ok, so we know that `kmemcheck` provides mechanism to check usage of `uninitialized memory` in the Linux kernel and how to enable it. How it does these checks? When the Linux kernel tries to allocate some memory i.e. something is called like this: @@ -148,7 +148,7 @@ Ok, so we know that `kmemcheck` provides mechanism to check usage of `uninitiali struct my_struct *my_struct = kmalloc(sizeof(struct my_struct), GFP_KERNEL); ``` -or in other words somebody wants to access a [page](https://en.wikipedia.org/wiki/Page_%28computer_memory%29), a [page fault](https://en.wikipedia.org/wiki/Page_fault) exception is generated. This is achieved by the fact that the `kmemcheck` marks memory pages as `non-present` (more about this you can read in the special part which is devoted to [Paging](https://0xax.gitbook.io/linux-insides/summary/theory/linux-theory-1)). If a `page fault` exception is occurred, the exception handler knows about it and in a case when the `kmemcheck` is enabled it transfers control to it. After the `kmemcheck` will finish its checks, the page will be marked as `present` and the interrupted code will be able to continue execution. There is little subtlety in this chain. When the first instruction of interrupted code will be executed, the `kmemcheck` will mark the page as `non-present` again. In this way next access to memory will be caught again. +or in other words somebody wants to access a [page](https://en.wikipedia.org/wiki/Page_%28computer_memory%29), a [page fault](https://en.wikipedia.org/wiki/Page_fault) exception is generated. This is achieved by the fact that the `kmemcheck` marks memory pages as `non-present` (more about this you can read in the special part which is devoted to [Paging](https://0xax.gitbook.io/linux-insides/summary/theory/linux-theory-1)). If a `page fault` exception occurred, the exception handler knows about it and in a case when the `kmemcheck` is enabled it transfers control to it. After the `kmemcheck` will finish its checks, the page will be marked as `present` and the interrupted code will be able to continue execution. There is little subtlety in this chain. When the first instruction of interrupted code will be executed, the `kmemcheck` will mark the page as `non-present` again. In this way next access to memory will be caught again. We just considered the `kmemcheck` mechanism from theoretical side. Now let's consider how it is implemented in the Linux kernel. @@ -215,9 +215,9 @@ if (!kmemcheck_selftest()) { printk(KERN_INFO "kmemcheck: Initialized\n"); ``` -and return with the `EINVAL` if this check is failed. The `kmemcheck_selftest` function checks sizes of different memory access related [opcodes](https://en.wikipedia.org/wiki/Opcode) like `rep movsb`, `movzwq` and etc. If sizes of opcodes are equal to expected sizes, the `kmemcheck_selftest` will return `true` and `false` in other way. +and return with the `EINVAL` if this check is failed. The `kmemcheck_selftest` function checks sizes of different memory access related [opcodes](https://en.wikipedia.org/wiki/Opcode) like `rep movsb`, `movzwq` and etc. If sizes of opcodes are equal to expected sizes, the `kmemcheck_selftest` will return `true` and `false` otherwise. -So when the somebody will call: +So when somebody calls: ```C struct my_struct *my_struct = kmalloc(sizeof(struct my_struct), GFP_KERNEL); @@ -236,7 +236,7 @@ if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { } ``` -So, here we check that the if `kmemcheck` is enabled and the `SLAB_NOTRACK` bit is not set in flags we set `non-present` bit for the just allocated page. The `SLAB_NOTRACK` bit tell us to not track uninitialized memory. Additionally we check if a cache object has constructor (details will be considered in next parts) we mark allocated page as uninitialized or unallocated in other way. The `kmemcheck_alloc_shadow` function is defined in the [mm/kmemcheck.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/mm/kmemcheck.c) source code file and does following things: +So, here we check that the if `kmemcheck` is enabled and the `SLAB_NOTRACK` bit is not set in flags we set `non-present` bit for the just allocated page. The `SLAB_NOTRACK` bit tell us to not track uninitialized memory. Additionally we check if a cache object has constructor (details will be considered in next parts) we mark allocated page as uninitialized or unallocated otherwise. The `kmemcheck_alloc_shadow` function is defined in the [mm/kmemcheck.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/mm/kmemcheck.c) source code file and does following things: ```C void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) @@ -276,7 +276,7 @@ void kmemcheck_hide_pages(struct page *p, unsigned int n) } ``` -Here we go through all pages and and tries to get `page table entry` for each page. If this operation was successful, we unset present bit and set hidden bit in each page. In the end we flush [translation lookaside buffer](https://en.wikipedia.org/wiki/Translation_lookaside_buffer), because some pages was changed. From this point allocated pages are tracked by the `kmemcheck`. Now, as `present` bit is unset, the [page fault](https://en.wikipedia.org/wiki/Page_fault) execution will be occurred right after the `kmalloc` will return pointer to allocated space and a code will try to access this memory. +Here we go through all pages and try to get `page table entry` for each page. If this operation was successful, we unset present bit and set hidden bit in each page. In the end we flush [translation lookaside buffer](https://en.wikipedia.org/wiki/Translation_lookaside_buffer), because some pages was changed. From this point allocated pages are tracked by the `kmemcheck`. Now, as `present` bit is unset, the [page fault](https://en.wikipedia.org/wiki/Page_fault) execution will be occurred right after the `kmalloc` will return pointer to allocated space and a code will try to access this memory. As you may remember from the [second part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-2) of the Linux kernel initialization chapter, the `page fault` handler is located in the [arch/x86/mm/fault.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/mm/fault.c) source code file and represented by the `do_page_fault` function. We can see following check from the beginning of the `do_page_fault` function: @@ -296,7 +296,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, } ``` -The `kmemcheck_active` gets `kmemcheck_context` [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) structure and return the result of comparison of the `balance` field of this structure with zero: +The `kmemcheck_active` gets `kmemcheck_context` [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) structure and returns the result of comparison of the `balance` field of this structure with zero: ``` bool kmemcheck_active(struct pt_regs *regs) @@ -314,7 +314,7 @@ if (kmemcheck_fault(regs, address, error_code)) return; ``` -First of all the `kmemcheck_fault` function checks that the fault was occurred by the correct reason. At first we check the [flags register](https://en.wikipedia.org/wiki/FLAGS_register) and check that we are in normal kernel mode: +First of all the `kmemcheck_fault` function checks that the fault occurred by the correct reason. At first we check the [flags register](https://en.wikipedia.org/wiki/FLAGS_register) and check that we are in normal kernel mode: ```C if (regs->flags & X86_VM_MASK) @@ -323,7 +323,7 @@ if (regs->cs != __KERNEL_CS) return false; ``` -If these checks wasn't successful we return from the `kmemcheck_fault` function as it was not `kmemcheck` related page fault. After this we try to lookup a `page table entry` related to the faulted address and if we can't find it we return: +If these checks weren't successful we return from the `kmemcheck_fault` function as it was not `kmemcheck` related page fault. After this we try to lookup a `page table entry` related to the faulted address and if we can't find it we return: ```C pte = kmemcheck_pte_lookup(address); @@ -331,7 +331,7 @@ if (!pte) return false; ``` -Last two steps of the `kmemcheck_fault` function is to call the `kmemcheck_access` function which check access to the given page and show addresses again by setting present bit in the given page. The `kmemcheck_access` function does all main job. It check current instruction which caused a page fault. If it will find an error, the context of this error will be saved by `kmemcheck` to the ring queue: +Last two steps of the `kmemcheck_fault` function is to call the `kmemcheck_access` function which check access to the given page and show addresses again by setting present bit in the given page. The `kmemcheck_access` function does all main job. It checks current instruction which caused a page fault. If it finds an error, the context of this error will be saved by `kmemcheck` to the ring queue: ```C static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE]; @@ -343,7 +343,7 @@ The `kmemcheck` mechanism declares special [tasklet](https://0xax.gitbook.io/lin static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0); ``` -which runs the `do_wakeup` function from the [arch/x86/mm/kmemcheck/error.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/mm/kmemcheck/error.c) source code file when it will be scheduled to run. +which runs the `do_wakeup` function from the [arch/x86/mm/kmemcheck/error.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/mm/kmemcheck/error.c) source code file when it is scheduled to run. The `do_wakeup` function will call the `kmemcheck_error_recall` function which will print errors collected by `kmemcheck`. As we already saw the: @@ -410,7 +410,7 @@ That's all. Conclusion -------------------------------------------------------------------------------- -This is the end of the third part about linux kernel [memory management](https://en.wikipedia.org/wiki/Memory_management). If you have questions or suggestions, ping me on twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com) or just create an [issue](https://github.com/0xAX/linux-insides/issues/new). In the next part we will see yet another memory debugging related tool - `kmemleak`. +This is the end of the third part about Linux kernel [memory management](https://en.wikipedia.org/wiki/Memory_management). If you have questions or suggestions, ping me on twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com) or just create an [issue](https://github.com/0xAX/linux-insides/issues/new). In the next part we will see yet another memory debugging related tool - `kmemleak`. **Please note that English is not my first language and I am really sorry for any inconvenience. If you found any mistakes please send me a PR to [linux-insides](https://github.com/0xAX/linux-insides).** diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..6777b2f1 --- /dev/null +++ b/Makefile @@ -0,0 +1,79 @@ +### HELP + +.PHONY: help +help: ## Print help + @egrep "(^### |^\S+:.*##\s)" Makefile | sed 's/^###\s*//' | sed 's/^\(\S*\)\:.*##\s*\(.*\)/ \1 - \2/' + +### DOCKER + +.PHONY: run +run: image ## docker run ... + (docker stop linux-insides-book 2>&1) > /dev/null || true + docker run --detach -p 4000:4000 --name linux-insides-book --hostname linux-insides-book linux-insides-book + +.PHONY: start +start: ## start the docker container ... + docker start linux-insides-book + +.PHONY: image +image: ## docker image build ... + docker image build --rm --squash --label linux-insides --tag linux-insides-book:latest -f Dockerfile . 2> /dev/null || \ + docker image build --rm --label linux-insides --tag linux-insides-book:latest -f Dockerfile . + +.PHONY: sh +sh: ## run interactive shell inside an already running docker container ... + docker exec -it linux-insides-book bash + +.PHONY: rm +rm: ## remove the docker container ... + (docker stop linux-insides-book 2>&1) > /dev/null || true + (docker rm linux-insides-book 2>&1) > /dev/null || true + +.PHONY: logs +logs: ## gather logs from the docker container ... + docker logs linux-insides-book + +.PHONY: export +export: ## run e-book generation inside an already running docker container ... + docker exec linux-insides-book /bin/bash -c ' \ + find . -type f -name '*.svg' -a ! \( -path "./.github/*" -o -path "./_book/*" \) -print0 | while IFS= read -r -d "" svg_file; do \ + output_file="$${svg_file%.svg}.png"; \ + chapter_dir=$$(dirname $$(dirname "$$svg_file")); \ + svg_relative_path="$${svg_file#$$chapter_dir/}"; \ + output_relative_path="$${output_file#$$chapter_dir/}"; \ + inkscape --export-png="$$output_file" \ + --export-area-page \ + --export-dpi=150 \ + "$$svg_file"; \ + find "$$chapter_dir" -maxdepth 1 -type f -name "*.md" -print0 | xargs -0 sed -i "s|\\([/ \\t\\(]\\)$${svg_relative_path}|\\1$${output_relative_path}|g"; \ + done; \ + gitbook epub; \ + gitbook mobi; \ + gitbook pdf; \ + mv book.pdf book-A4.pdf; \ + mv book-A5.json book.json; \ + gitbook pdf; \ + mv book.pdf book-A5.pdf; \ + mv book-A4.pdf book.pdf' + +.PHONY: cp +cp: ## copy all exported e-book formats to current working directory ... + docker cp linux-insides-book:/srv/gitbook/book.epub "Linux Inside - 0xAX.epub" + docker cp linux-insides-book:/srv/gitbook/book.mobi "Linux Inside - 0xAX.mobi" + docker cp linux-insides-book:/srv/gitbook/book.pdf "Linux Inside - 0xAX.pdf" + docker cp linux-insides-book:/srv/gitbook/book-A5.pdf "Linux Inside - 0xAX (A5).pdf" + +.PHONY: clean +clean: ## remove all exported e-book files ... + rm "Linux Inside - 0xAX.epub" \ + "Linux Inside - 0xAX.mobi" \ + "Linux Inside - 0xAX.pdf" \ + "Linux Inside - 0xAX (A5).pdf" + +### LAUNCH BROWSER + +.PHONY: browse +browse: ## Launch broweser + @timeout 60 sh -c 'until nc -z 127.0.0.1 4000; do sleep 1; done' || true + @(uname | grep Darwin > /dev/null) && open http://127.0.0.1:4000 || true + @(uname | grep Linux > /dev/null) && xdg-open http://127.0.0.1:4000 || true diff --git a/Misc/images/dgap_menu.png b/Misc/images/dgap_menu.png index 27e23c5c..9eefa072 100644 Binary files a/Misc/images/dgap_menu.png and b/Misc/images/dgap_menu.png differ diff --git a/Misc/images/git_diff.png b/Misc/images/git_diff.png index 30cc493c..738a362e 100644 Binary files a/Misc/images/git_diff.png and b/Misc/images/git_diff.png differ diff --git a/Misc/images/github.png b/Misc/images/github.png index d2da8950..f0591c0d 100644 Binary files a/Misc/images/github.png and b/Misc/images/github.png differ diff --git a/Misc/images/google_linux.png b/Misc/images/google_linux.png index c9eeeab8..9123bde4 100644 Binary files a/Misc/images/google_linux.png and b/Misc/images/google_linux.png differ diff --git a/Misc/images/menuconfig.png b/Misc/images/menuconfig.png index 97542825..c86f25c8 100644 Binary files a/Misc/images/menuconfig.png and b/Misc/images/menuconfig.png differ diff --git a/Misc/images/nconfig.png b/Misc/images/nconfig.png index 5a142396..57908953 100644 Binary files a/Misc/images/nconfig.png and b/Misc/images/nconfig.png differ diff --git a/Misc/images/qemu.png b/Misc/images/qemu.png index 2599f78e..54218b6b 100644 Binary files a/Misc/images/qemu.png and b/Misc/images/qemu.png differ diff --git a/Misc/linux-misc-1.md b/Misc/linux-misc-1.md index ad417d0d..1cc1b57b 100644 --- a/Misc/linux-misc-1.md +++ b/Misc/linux-misc-1.md @@ -165,7 +165,7 @@ As result of compilation we can see the compressed kernel - `arch/x86/boot/bzIma Installing Linux kernel -------------------------------------------------------------------------------- -As I already wrote we will consider two ways how to launch new kernel: In the first case we can install and run the new version of the Linux kernel on the real hardware and the second is launch the Linux kernel on a virtual machine. In the previous paragraph we saw how to build the Linux kernel from source code and as a result we have got compressed image: +As I already wrote we will consider two ways to launch new kernel: in the first case we can install and run the new version of the Linux kernel on the real hardware and the second is launch the Linux kernel on a virtual machine. In the previous paragraph we saw how to build the Linux kernel from source code and as a result we have got compressed image: ``` ... @@ -224,7 +224,7 @@ $ make -j4 `busybox` is an executable file - `/bin/busybox` that contains a set of standard tools like [coreutils](https://en.wikipedia.org/wiki/GNU_Core_Utilities). In the `busybox` menu we need to enable: `Build BusyBox as a static binary (no shared libs)` option: -![busybox menu](http://i68.tinypic.com/11933bp.png) +![busysbox menu](https://i.imgur.com/TxPRCzQ.png) We can find this menu in the: @@ -270,7 +270,7 @@ $ find . -print0 | cpio --null -ov --format=newc | gzip -9 > ~/dev/initrd_x86_64 We can now run our kernel in the virtual machine. As I already wrote I prefer [qemu](https://en.wikipedia.org/wiki/QEMU) for this. We can run our kernel with the following command: ``` -$ qemu-system-x86_64 -snapshot -m 8GB -serial stdio -kernel ~/dev/linux/arch/x86_64/boot/bzImage -initrd ~/dev/initrd_x86_64.gz -append "root=/dev/sda1 ignore_loglevel" +$ qemu-system-x86_64 -snapshot -m 8G -serial stdio -kernel ~/dev/linux/arch/x86_64/boot/bzImage -initrd ~/dev/initrd_x86_64.gz -append "root=/dev/sda1 ignore_loglevel" ``` ![qemu](images/qemu.png) @@ -282,7 +282,7 @@ Consider using [ivandaviov/minimal](https://github.com/ivandavidov/minimal) or [ Getting started with the Linux Kernel Development --------------------------------------------------------------------------------- -The main point of this paragraph is to answer two questions: What to do and what not to do before sending your first patch to the Linux kernel. Please, do not confuse this `to do` with `todo`. I have no answer what you can fix in the Linux kernel. I just want to tell you my workflow during experimenting with the Linux kernel source code. +The main point of this paragraph is to answer two questions: what to do and what not to do before sending your first patch to the Linux kernel. Please, do not confuse this `to do` with `todo`. I have no answer what you can fix in the Linux kernel. I just want to tell you my workflow during experimenting with the Linux kernel source code. First of all I pull the latest updates from Linus's repo with the following commands: @@ -291,7 +291,7 @@ $ git checkout master $ git pull upstream master ``` -As soon as your local copy of the linux kernel source code is in sync with the [mainline](https://github.com/torvalds/linux) repository, we can start to apply changes to it. I already wrote, I have no advice for where you should start and what `TODO` to choose within the Linux kernel. But the best place for newbies is the `staging` tree. In other words the set of drivers from the [drivers/staging](https://github.com/torvalds/linux/tree/master/drivers/staging) directory. The maintainer of this tree is [Greg Kroah-Hartman](https://en.wikipedia.org/wiki/Greg_Kroah-Hartman) and the `staging` drivers are a good target for trivial patch fixes. Let's look at this simple example, that describes how to generate a patch, check it and send it to the [Linux kernel mail listing](https://lkml.org/). +As soon as your local copy of the Linux kernel source code is in sync with the [mainline](https://github.com/torvalds/linux) repository, we can start to apply changes to it. I already wrote, I have no advice for where you should start and what `TODO` to choose within the Linux kernel. But the best place for newbies is the `staging` tree. In other words the set of drivers from the [drivers/staging](https://github.com/torvalds/linux/tree/master/drivers/staging) directory. The maintainer of this tree is [Greg Kroah-Hartman](https://en.wikipedia.org/wiki/Greg_Kroah-Hartman) and the `staging` drivers are a good target for trivial patch fixes. Let's look at this simple example, that describes how to generate a patch, check it and send it to the [Linux kernel mail listing](https://lkml.org/). If we look in the driver for the [Digi International EPCA PCI](https://github.com/torvalds/linux/tree/master/drivers/staging/dgap) based devices, we will see the `dgap_sindex` function on line 295: diff --git a/Misc/linux-misc-2.md b/Misc/linux-misc-2.md index c03fb07c..a1669423 100644 --- a/Misc/linux-misc-2.md +++ b/Misc/linux-misc-2.md @@ -110,7 +110,7 @@ The `C` option tells the `makefile` that we need to check all `c` source code wi ifeq ($(KBUILD_SRC),) srctree := . endif - + objtree := . src := $(srctree) obj := $(objtree) @@ -250,7 +250,7 @@ all: vmlinux Don't worry that we have missed many lines in Makefile that are between `export RCS_FIND_IGNORE.....` and `all: vmlinux.....`. This part of the makefile is responsible for the `make *.config` targets and as I wrote in the beginning of this part we will see only building of the kernel in a general way. -The `all:` target is the default when no target is given on the command line. You can see here that we include architecture specific makefile there (in our case it will be [arch/x86/Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/Makefile)). From this moment we will continue from this makefile. As we can see `all` target depends on the `vmlinux` target that defined a little lower in the top makefile: +The `all:` target is the default when no target is given on the command line. You can see here that we include architecture specific makefile there (in our case it will be [arch/x86/Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/Makefile)). From this moment we will continue from this makefile. As we can see `all` target depends on the `vmlinux` target that is defined a little lower in the top makefile: ```Makefile vmlinux: scripts/link-vmlinux.sh $(vmlinux-deps) FORCE @@ -302,7 +302,7 @@ prepare1: prepare2 $(version_h) include/generated/utsrelease.h \ prepare2: prepare3 outputmakefile asm-generic ``` -The first `prepare0` expands to the `archprepare` that expands to the `archheaders` and `archscripts` that defined in the `x86_64` specific [Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/Makefile). Let's look on it. The `x86_64` specific makefile starts from the definition of the variables that are related to the architecture-specific configs ([defconfig](https://github.com/torvalds/linux/tree/master/arch/x86/configs), etc...). After this it defines flags for the compiling of the [16-bit](https://en.wikipedia.org/wiki/Real_mode) code, calculating of the `BITS` variable that can be `32` for `i386` or `64` for the `x86_64` flags for the assembly source code, flags for the linker and many many more (all definitions you can find in the [arch/x86/Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/Makefile)). The first target is `archheaders` in the makefile generates syscall table: +The first `prepare0` expands to the `archprepare` that expands to the `archheaders` and `archscripts` that defined in the `x86_64` specific [Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/Makefile). Let's look on it. The `x86_64` specific makefile starts from the definition of the variables that are related to the architecture-specific configs ([defconfig](https://github.com/torvalds/linux/tree/master/arch/x86/configs), etc...). After this it defines flags for the compiling of the [16-bit](https://en.wikipedia.org/wiki/Real_mode) code, calculating of the `BITS` variable that can be `32` for `i386` or `64` for the `x86_64` flags for the assembly source code, flags for the linker and many many more (all definitions you can find in the [arch/x86/Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/Makefile)). The first target is `archheaders` in the makefile and it generates syscall table: ```Makefile archheaders: @@ -425,7 +425,7 @@ $(vmlinux-dirs): prepare scripts $(Q)$(MAKE) $(build)=$@ ``` -The `$@` represents `vmlinux-dirs` here that means that it will go recursively over all directories from the `vmlinux-dirs` and its internal directories (depens on configuration) and will execute `make` in there. We can see it in the output: +The `$@` represents `vmlinux-dirs` here that means that it will go recursively over all directories from the `vmlinux-dirs` and its internal directories (depends on configuration) and will execute `make` in there. We can see it in the output: ``` CC init/main.o diff --git a/Misc/linux-misc-3.md b/Misc/linux-misc-3.md index 767005fd..aa897cd3 100644 --- a/Misc/linux-misc-3.md +++ b/Misc/linux-misc-3.md @@ -39,7 +39,7 @@ The `lib.c` file contains: ```C int factorial(int base) { int res,i = 1; - + if (base == 0) { return 1; } @@ -107,8 +107,8 @@ Disassembly of section .text: 20: b8 00 00 00 00 mov $0x0,%eax 25: e8 00 00 00 00 callq 2a 2a: b8 00 00 00 00 mov $0x0,%eax - 2f: c9 leaveq - 30: c3 retq + 2f: c9 leaveq + 30: c3 retq ``` Here we are interested only in the two `callq` operations. The two `callq` operations contain `linker stubs`, or the function name and offset from it to the next instruction. These stubs will be updated to the real addresses of the functions. We can see these functions' names within the following `objdump` output: @@ -169,7 +169,7 @@ factorial: file format elf64-x86-64 ... ``` -As we can see in the previous output, the address of the `main` function is `0x0000000000400506`. Why it does not start from `0x0`? You may already know that standard C programs are linked with the `glibc` C standard library (assuming the `-nostdlib` was not passed to the `gcc`). The compiled code for a program includes constructor functions to initialize data in the program when the program is started. These functions need to be called before the program is started, or in another words before the `main` function is called. To make the initialization and termination functions work, the compiler must output something in the assembler code to cause those functions to be called at the appropriate time. Execution of this program will start from the code placed in the special `.init` section. We can see this in the beginning of the objdump output: +As we can see in the previous output, the address of the `main` function is `0x0000000000400506`. Why doesn't it start from `0x0`? You may already know that standard C programs are linked with the `glibc` C standard library (assuming the `-nostdlib` was not passed to the `gcc`). The compiled code for a program includes constructor functions to initialize data in the program when the program is started. These functions need to be called before the program is started, or in another words before the `main` function is called. To make the initialization and termination functions work, the compiler must output something in the assembler code to cause those functions to be called at the appropriate time. Execution of this program will start from the code placed in the special `.init` section. We can see this in the beginning of the objdump output: ``` objdump -S factorial | less @@ -215,7 +215,7 @@ $ gcc main.c lib.o -o factorial and after it we will get executable file - `factorial` as a result: ``` -./factorial +./factorial factorial of 5 is: 120 ``` @@ -312,13 +312,13 @@ $ objdump -S /usr/lib/gcc/x86_64-linux-gnu/4.9/../../../x86_64-linux-gnu/crtn.o 0000000000000000 <.init>: 0: 48 83 c4 08 add $0x8,%rsp - 4: c3 retq + 4: c3 retq Disassembly of section .fini: 0000000000000000 <.fini>: 0: 48 83 c4 08 add $0x8,%rsp - 4: c3 retq + 4: c3 retq ``` And the `crti.o` object file contains the `_init` and `_fini` symbols. Let's try to link again with these two object files: @@ -344,14 +344,14 @@ $ ld \ Finally we get an executable file, but if we try to run it, we will get strange results: ``` -$ ./factorial +$ ./factorial bash: ./factorial: No such file or directory ``` What's the problem here? Let's look on the executable file with the [readelf](https://sourceware.org/binutils/docs/binutils/readelf.html) util: ``` -$ readelf -l factorial +$ readelf -l factorial Elf file type is EXEC (Executable file) Entry point 0x4003c0 @@ -378,13 +378,13 @@ Program Headers: Section to Segment mapping: Segment Sections... - 00 - 01 .interp - 02 .interp .note.ABI-tag .hash .dynsym .dynstr .gnu.version .gnu.version_r .rela.dyn .rela.plt .init .plt .text .fini .rodata .eh_frame - 03 .dynamic .got .got.plt .data - 04 .dynamic - 05 .note.ABI-tag - 06 + 00 + 01 .interp + 02 .interp .note.ABI-tag .hash .dynsym .dynstr .gnu.version .gnu.version_r .rela.dyn .rela.plt .init .plt .text .fini .rodata .eh_frame + 03 .dynamic .got .got.plt .data + 04 .dynamic + 05 .note.ABI-tag + 06 ``` Note on the strange line: @@ -429,7 +429,7 @@ and after this we link object files of our program with the needed system object Useful command line options of the GNU linker ---------------------------------------------- -As I already wrote and as you can see in the manual of the `GNU linker`, it has big set of the command line options. We've seen a couple of options in this post: `-o ` - that tells `ld` to produce an output file called `output` as the result of linking, `-l` that adds the archive or object file specified by the name, `-dynamic-linker` that specifies the name of the dynamic linker. Of course `ld` supports much more command line options, let's look at some of them. +As I already wrote and as you can see in the manual of the `GNU linker`, it has a big set of command line options. We've seen a couple of options in this post: `-o ` - that tells `ld` to produce an output file called `output` as the result of linking, `-l` that adds the archive or object file specified by the name, `-dynamic-linker` that specifies the name of the dynamic linker. Of course `ld` supports much more command line options, let's look at some of them. The first useful command line option is `@file`. In this case the `file` specifies filename where command line options will be read. For example we can create file with the name `linker.ld`, put there our command line arguments from the previous example and execute it with: @@ -445,7 +445,7 @@ The next command line option is `--defsym`. Full format of this command line opt LDFLAGS_vmlinux = --defsym _kernel_bss_size=$(KBSS_SZ) ``` -As we already know, it defines the `_kernel_bss_size` symbol with the size of the `.bss` section in the output file. This symbol will be used in the first [assembly file](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/arm/boot/compressed/head.S) that will be executed during kernel decompressing: +As we already know, it defines the `_kernel_bss_size` symbol with the size of the `.bss` section in the output file. This symbol will be used in the first [assembly file](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/arm/boot/compressed/head.S) that will be executed during kernel decompression: ```assembly ldr r5, =_kernel_bss_size @@ -474,7 +474,7 @@ $ ld -M @linker.ld 0x000000000040041b factorial ``` -Of course the `GNU linker` support standard command line options: `--help` and `--version` that print common help of the usage of the `ld` and its version. That's all about command line options of the `GNU linker`. Of course it is not the full set of command line options supported by the `ld` util. You can find the complete documentation of the `ld` util in the manual. +Of course the `GNU linker` supports standard command line options: `--help` and `--version` that prints common help of the usage of the `ld` and its version. That's all about command line options of the `GNU linker`. Of course it is not the full set of command line options supported by the `ld` util. You can find the complete documentation of the `ld` util in the manual. Control Language linker ---------------------------------------------- @@ -524,7 +524,7 @@ Our program consists from two sections: `.text` contains code of the program and /* * Linker script for the factorial */ -OUTPUT(hello) +OUTPUT(hello) OUTPUT_FORMAT("elf64-x86-64") INPUT(hello.o) @@ -607,14 +607,14 @@ SECTIONS } ``` -As you already may noted the syntax for expressions in the linker script language is identical to that of C expressions. Besides this the control language of the linking supports following builtin functions: +As you already may have noted, the syntax for expressions in the linker script language is identical to that of C expressions. Besides this the control language of the linking supports following builtin functions: * `ABSOLUTE` - returns absolute value of the given expression; * `ADDR` - takes the section and returns its address; * `ALIGN` - returns the value of the location counter (`.` operator) that aligned by the boundary of the next expression after the given expression; -* `DEFINED` - returns `1` if the given symbol placed in the global symbol table and `0` in other way; +* `DEFINED` - returns `1` if the given symbol placed in the global symbol table and `0` otherwise; * `MAX` and `MIN` - return maximum and minimum of the two given expressions; -* `NEXT` - returns the next unallocated address that is a multiple of the give expression; +* `NEXT` - returns the next unallocated address that is a multiple of the given expression; * `SIZEOF` - returns the size in bytes of the given named section. That's all. diff --git a/Misc/linux-misc-4.md b/Misc/linux-misc-4.md index 09359fae..3e9500a4 100644 --- a/Misc/linux-misc-4.md +++ b/Misc/linux-misc-4.md @@ -4,9 +4,9 @@ Program startup process in userspace Introduction -------------------------------------------------------------------------------- -Despite the [linux-insides](https://www.gitbook.com/book/0xax/linux-insides/details) described mostly Linux kernel related stuff, I have decided to write this one part which mostly related to userspace. +Despite the [linux-insides](https://www.gitbook.com/book/0xax/linux-insides/details) described mostly Linux kernel related stuff, I have decided to write this one part which mostly relates to userspace. -There is already fourth [part](https://0xax.gitbook.io/linux-insides/summary/syscall/linux-syscall-4) of [System calls](https://en.wikipedia.org/wiki/System_call) chapter which describes what does the Linux kernel do when we want to start a program. In this part I want to explore what happens when we run a program on a Linux machine from userspace perspective. +There is already fourth [part](https://0xax.gitbook.io/linux-insides/summary/syscall/linux-syscall-4) of [System calls](https://en.wikipedia.org/wiki/System_call) chapter which describes what the Linux kernel does when we want to start a program. In this part I want to explore what happens when we run a program on a Linux machine from userspace perspective. I don't know how about you, but in my university I learn that a `C` program starts executing from the function which is called `main`. And that's partly true. Whenever we are starting to write new program, we start our program from the following lines of code: @@ -75,7 +75,7 @@ Note on `Entry point: 0x400430` line. Now we know the actual address of entry po (gdb) break *0x400430 Breakpoint 1 at 0x400430 (gdb) run -Starting program: /home/alex/program +Starting program: /home/alex/program Breakpoint 1, 0x0000000000400430 in _start () ``` @@ -135,7 +135,7 @@ SYSCALL_DEFINE3(execve, } ``` -It takes an executable file name, set of command line arguments, and set of environment variables. As you may guess, everything is done by the `do_execve` function. I will not describe the implementation of the `do_execve` function in detail because you can read about this in [here](https://0xax.gitbook.io/linux-insides/summary/syscall/linux-syscall-4). But in short words, the `do_execve` function does many checks like `filename` is valid, limit of launched processes is not exceed in our system and etc. After all of these checks, this function parses our executable file which is represented in [ELF](https://en.wikipedia.org/wiki/Executable_and_Linkable_Format) format, creates memory descriptor for newly executed executable file and fills it with the appropriate values like area for the stack, heap and etc. When the setup of new binary image is done, the `start_thread` function will set up one new process. This function is architecture-specific and for the [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture, its definition will be located in the [arch/x86/kernel/process_64.c](https://github.com/torvalds/linux/blob/08e4e0d0456d0ca8427b2d1ddffa30f1c3e774d7/arch/x86/kernel/process_64.c#L239) source code file. +It takes an executable file name, set of command line arguments, and set of environment variables. As you may guess, everything is done by the `do_execve` function. I will not describe the implementation of the `do_execve` function in detail because you can read about this in [here](https://0xax.gitbook.io/linux-insides/summary/syscall/linux-syscall-4). But in short words, the `do_execve` function does many checks like `filename` is valid, limit of launched processes is not exceeded in our system and etc. After all of these checks, this function parses our executable file which is represented in [ELF](https://en.wikipedia.org/wiki/Executable_and_Linkable_Format) format, creates memory descriptor for newly executed executable file and fills it with the appropriate values like area for the stack, heap and etc. When the setup of new binary image is done, the `start_thread` function will set up one new process. This function is architecture-specific and for the [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture, its definition will be located in the [arch/x86/kernel/process_64.c](https://github.com/torvalds/linux/blob/08e4e0d0456d0ca8427b2d1ddffa30f1c3e774d7/arch/x86/kernel/process_64.c#L239) source code file. The `start_thread` function sets new value to [segment registers](https://en.wikipedia.org/wiki/X86_memory_segmentation) and program execution address. From this point, our new process is ready to start. Once the [context switch](https://en.wikipedia.org/wiki/Context_switch) will be done, control will be returned to userspace with new values of registers and the new executable will be started to execute. @@ -144,13 +144,13 @@ That's all from the kernel side. The Linux kernel prepares the binary image for How does a program start in userspace -------------------------------------------------------------------------------- -In the previous paragraph we saw how an executable file is prepared to run by the Linux kernel. Let's look at the same, but from userspace side. We already know that the entry point of each program is its `_start` function. But where is this function from? It may came from a library. But if you remember correctly we didn't link our program with any libraries during compilation of our program: +In the previous paragraph we saw how an executable file is prepared to run by the Linux kernel. Let's look at the same, but from userspace side. We already know that the entry point of each program is its `_start` function. But where is this function from? It may come from a library. But if you remember correctly we didn't link our program with any libraries during compilation of our program: ``` $ gcc -Wall program.c -o sum ``` -You may guess that `_start` comes from the [standard library](https://en.wikipedia.org/wiki/Standard_library) and that's true. If you try to compile our program again and pass the `-v` option to gcc which will enable `verbose mode`, you will see a long output. The full output is not interesting for us, let's look at the following steps: +You may guess that `_start` comes from the [standard library](https://en.wikipedia.org/wiki/Standard_library) and that's true. If you try to compile our program again and pass the `-v` option to gcc which will enable `verbose mode`, you will see a long output. The full output is not interesting for us, let's look at the following steps: First of all, our program should be compiled with `gcc`: @@ -217,10 +217,10 @@ $ gcc -nostdlib -lc -ggdb program.c -o program /usr/bin/ld: warning: cannot find entry symbol _start; defaulting to 0000000000400350 ``` -Ok, the compiler does not complain about undefined reference of standard library functions anymore as we linked our program with `/usr/lib64/libc.so.6`, but the `_start` symbol isn't resolved yet. Let's return to the verbose output of `gcc` and look at the parameters of `collect2`. The most important thing that we may see is that our program is linked not only with the standard library, but also with some object files. The first object file is: `/lib64/crt1.o`. And if we look inside this object file with `objdump`, we will see the `_start` symbol: +Ok, the compiler does not complain about undefined reference of standard library functions anymore as we linked our program with `/usr/lib64/libc.so.6`, but the `_start` symbol isn't resolved yet. Let's return to the verbose output of `gcc` and look at the parameters of `collect2`. The most important thing that we may see is that our program is linked not only with the standard library, but also with some object files. The first object file is: `/lib64/crt1.o`. And if we look inside this object file with `objdump`, we will see the `_start` symbol: ``` -$ objdump -d /lib64/crt1.o +$ objdump -d /lib64/crt1.o /lib64/crt1.o: file format elf64-x86-64 @@ -239,7 +239,7 @@ Disassembly of section .text: 16: 48 c7 c1 00 00 00 00 mov $0x0,%rcx 1d: 48 c7 c7 00 00 00 00 mov $0x0,%rdi 24: e8 00 00 00 00 callq 29 <_start+0x29> - 29: f4 hlt + 29: f4 hlt ``` As `crt1.o` is a shared object file, we see only stubs here instead of real calls. Let's look at the source code of the `_start` function. As this function is architecture specific, implementation for `_start` will be located in the [sysdeps/x86_64/start.S](https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/x86_64/start.S;h=f1b961f5ba2d6a1ebffee0005f43123c4352fbf4;hb=HEAD) assembly file. @@ -285,11 +285,11 @@ We can get all the arguments we need for `__libc_start_main` function from the s ``` +-----------------+ | NULL | -+-----------------+ ++-----------------+ | ... | | envp | | ... | -+-----------------+ ++-----------------+ | NULL | +------------------ | ... | @@ -297,7 +297,7 @@ We can get all the arguments we need for `__libc_start_main` function from the s | ... | +------------------ | argc | <- rsp -+-----------------+ ++-----------------+ ``` After we cleared `ebp` register and saved the address of the termination function in the `r9` register, we pop an element from the stack to the `rsi` register, so after this `rsp` will point to the `argv` array and `rsi` will contain count of command line arguments passed to the program: @@ -305,11 +305,11 @@ After we cleared `ebp` register and saved the address of the termination functio ``` +-----------------+ | NULL | -+-----------------+ ++-----------------+ | ... | | envp | | ... | -+-----------------+ ++-----------------+ | NULL | +------------------ | ... | @@ -337,7 +337,7 @@ mov $__libc_csu_init, %RCX_LP mov $main, %RDI_LP ``` -After stack aligning we push the address of the stack, move the addresses of contstructor and destructor to the `r8` and `rcx` registers and address of the `main` symbol to the `rdi`. From this moment we can call the `__libc_start_main` function from the [csu/libc-start.c](https://sourceware.org/git/?p=glibc.git;a=blob;f=csu/libc-start.c;h=0fb98f1606bab475ab5ba2d0fe08c64f83cce9df;hb=HEAD). +After stack aligning we push the address of the stack, move the addresses of constructor and destructor to the `r8` and `rcx` registers and address of the `main` symbol to the `rdi`. From this moment we can call the `__libc_start_main` function from the [csu/libc-start.c](https://sourceware.org/git/?p=glibc.git;a=blob;f=csu/libc-start.c;h=0fb98f1606bab475ab5ba2d0fe08c64f83cce9df;hb=HEAD). Before we look at the `__libc_start_main` function, let's add the `/lib64/crt1.o` and try to compile our program again: diff --git a/README.md b/README.md index f57963fe..ff4cd140 100644 --- a/README.md +++ b/README.md @@ -1,39 +1,27 @@ -linux-insides -=============== +# Linux insides -A book-in-progress about the linux kernel and its insides. +This repository contains a book-in-progress about the Linux kernel and its insides. -**The goal is simple** - to share my modest knowledge about the insides of the linux kernel and help people who are interested in linux kernel insides, and other low-level subject matter. Feel free to go through the book [Start here](https://github.com/0xAX/linux-insides/blob/master/SUMMARY.md) +The goal of this project is simple – to share knowledge about the Linux kernel internals and related low-level topics. If you’re curious about what’s under the hood, see the [Table of Contents](https://github.com/0xAX/linux-insides/blob/master/SUMMARY.md). -**Questions/Suggestions**: Feel free about any questions or suggestions by pinging me at twitter [@0xAX](https://twitter.com/0xAX), adding an [issue](https://github.com/0xAX/linux-insides/issues/new) or just drop me an [email](mailto:anotherworldofworld@gmail.com). +> [!IMPORTANT] +> I started writing this series when the latest version of the kernel was `3.18`. A lot has changed since then, and I am in progress of updating the content to reflect modern kernels (v6.16+). I’ll continue revising the posts as the kernel evolves. -# Mailing List +## Requirements -We have a Google Group mailing list for learning the kernel source code. Here are some instructions about how to use it. +- Prior knowledge about the [Assembly language](https://en.wikipedia.org/wiki/Assembly_language) +- Proficiency with the [C programming language](https://en.wikipedia.org/wiki/C_(programming_language)) +- Additionally, you can find lots of useful information about x86_64 processors in [Intel Software Developer Manuals](https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html) -#### Join +> [!TIP] +> You can get started with Assembler with my other series of posts about the [Assembly programming](https://github.com/0xAX/asm). -Send an email with any subject/content to `kernelhacking+subscribe@googlegroups.com`. Then you will receive a confirmation email. Reply it with any content and then you are done. +## Translations -> If you have Google account, you can also open the [archive page](https://groups.google.com/forum/#!forum/kernelhacking) and click **Apply to join group**. You will be approved automatically. +Thanks to the volunteers, the posts about Linux are translated into different languages. -#### Send emails to mailing list - -Just send emails to `kernelhacking@googlegroups.com`. The basic usage is the same as other mailing lists powered by mailman. - -#### Archives - -https://groups.google.com/forum/#!forum/kernelhacking - -Support -------- - -**Support** If you like `linux-insides` you can support me with: - -[![Support with bitcoin](https://img.shields.io/badge/donate-bitcoin-green.svg)](https://www.coinbase.com/checkouts/0bfa452a41cf52c0b3f99500b4f31685) [![Join the chat at https://gitter.im/0xAX/linux-insides](https://badges.gitter.im/0xAX/linux-insides.svg)](https://gitter.im/0xAX/linux-insides?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) - -On other languages -------------------- +> [!NOTE] +> The translations may diverge from the original content. * [Brazilian Portuguese](https://github.com/mauri870/linux-insides) * [Chinese](https://github.com/MintCN/linux-insides-zh) @@ -43,64 +31,27 @@ On other languages * [Spanish](https://github.com/leolas95/linux-insides) * [Turkish](https://github.com/ayyucedemirbas/linux-insides_Turkish) -Docker ------- - -In order to run your own copy of the book with gitbook within a local container: - -1. Enable Docker experimental features with vim or another text editor - ```bash - sudo vim /usr/lib/systemd/system/docker.service - ``` - - Then add --experimental=true to the end of the ExecStart=/usr/bin/dockerd -H fd:// line and save. - - Eg: *ExecStart=/usr/bin/dockerd -H fd:// --experimental=true* - - Then, you need to reload and restart the Docker daemon: - ```bash - systemctl daemon-reload - systemctl restart docker.service - ``` +## Contribution -2. Build container image - ```bash - docker image build \ - --rm --squash \ - --label linux-insides \ - --tag linux-insides-book:latest \ - -f Dockerfile . - ``` +Read the [Contribution guide](./CONTRIBUTING.md) to learn how to contribute to the project. When contributing, make sure to follow the [Code of Conduct](./CODE_OF_CONDUCT.md). -3. Create and run book in local container - ```bash - docker run \ - --detach \ - --rm \ - -p 4000:4000 \ - --name linux-insides-book \ - --hostname linux-insides-book \ - linux-insides-book - ``` +If you have any questions or suggestions, feel free to ping me at Twitter [@0xAX](https://twitter.com/0xAX), add an [issue](https://github.com/0xAX/linux-insides/issues/new), or drop me an [email](mailto:anotherworldofworld@gmail.com). -4. Open your local copy of linux insides book under this url - http://localhost:4000 +## Mailing list -Contributions --------------- +There is a Google group mailing list (`kernelhacking@googlegroups.com`) for learning the kernel source code. -Feel free to create issues or pull-requests if you have any problems. +To join the group, send an email to `kernelhacking+subscribe@googlegroups.com`. You will receive a confirmation email. After replying to it, you will be added to the mailing list. -**Please read [CONTRIBUTING.md](https://github.com/0xAX/linux-insides/blob/master/CONTRIBUTING.md) before pushing any changes.** +> [!TIP] +> If you have a Google account, you can simply open the [archive page](https://groups.google.com/forum/#!forum/kernelhacking) and click **Apply to join group**. You will be approved automatically. -![linux-kernel](Assets/linux-kernel.png) +## License -Author ---------------- +This project is licensed under the [BY-NC-SA Creative Commons](http://creativecommons.org/licenses/by-nc-sa/4.0/). -[@0xAX](https://twitter.com/0xAX) +## Author -LICENSE -------------- +The technical content is written by [@0xAX](https://x.com/0xAX). -Licensed [BY-NC-SA Creative Commons](http://creativecommons.org/licenses/by-nc-sa/4.0/). +Additional big thanks to [@klaudiagrz](https://github.com/klaudiagrz) for text improvements. diff --git a/SyncPrim/linux-sync-2.md b/SyncPrim/linux-sync-2.md index 8b6fedc7..53e5b06f 100644 --- a/SyncPrim/linux-sync-2.md +++ b/SyncPrim/linux-sync-2.md @@ -81,7 +81,7 @@ The topic of this part is `queued spinlocks`. This approach may help to solve bo The basic idea of the `MCS` lock is that a thread spins on a local variable and each processor in the system has its own copy of this variable (see the previous paragraph). In other words this concept is built on top of the [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) variables concept in the Linux kernel. -When the first thread wants to acquire a lock, it registers itself in the `queue`. In other words it will be added to the special `queue` and will acquire lock, because it is free for now. When the second thread wants to acquire the same lock before the first thread release it, this thread adds its own copy of the lock variable into this `queue`. In this case the first thread will contain a `next` field which will point to the second thread. From this moment, the second thread will wait until the first thread release its lock and notify `next` thread about this event. The first thread will be deleted from the `queue` and the second thread will be owner of a lock. +When the first thread wants to acquire a lock, it registers itself in the `queue`. In other words it will be added to the special `queue` and will acquire lock, because it is free for now. When the second thread wants to acquire the same lock before the first thread releases it, this thread adds its own copy of the lock variable into this `queue`. In this case the first thread will contain a `next` field which will point to the second thread. From this moment, the second thread will wait until the first thread releases its lock and notifies `next` thread about this event. The first thread will be deleted from the `queue` and the second thread will be owner of a lock. Schematically we can represent it like: @@ -175,7 +175,7 @@ All of these macros expand to the call of functions from the same header file. A typedef struct qspinlock { union { atomic_t val; - + struct { u8 locked; u8 pending; @@ -335,7 +335,7 @@ This array allows to make four attempts of a lock acquisition for the four event * software interrupt context; * non-maskable interrupt context. -Notice that we did not touch `queue` yet. We no need in it, because for two threads it just leads to unnecessary latency for memory access. In other case, the first thread may release it lock before this moment. In this case the `lock->val` will contain `_Q_LOCKED_VAL | _Q_PENDING_VAL` and we will start to build `queue`. We start to build `queue` by the getting the local copy of the `qnodes` array of the processor which executes thread and calculate `tail` which will indicate the tail of the `queue` and `idx` which represents an index of the `qnodes` array: +Notice that we did not touch `queue` yet. We do not need it, because for two threads it just leads to unnecessary latency for memory access. In other case, the first thread may release it lock before this moment. In this case the `lock->val` will contain `_Q_LOCKED_VAL | _Q_PENDING_VAL` and we will start to build `queue`. We start to build `queue` by the getting the local copy of the `qnodes` array of the processor which executes thread and calculate `tail` which will indicate the tail of the `queue` and `idx` which represents an index of the `qnodes` array: ```C queue: @@ -376,7 +376,7 @@ because we no need in it anymore as lock is acquired. If the `queued_spin_tryloc next = NULL; ``` -and retrieve previous tail. The next step is to check that `queue` is not empty. In this case we need to link previous entry with the new. While waitaing for the MCS lock, the next pointer may have been set by another lock waiter. We optimistically load the next pointer & prefetch the cacheline for writing to reduce latency in the upcoming MCS unlock operation: +and retrieve previous tail. The next step is to check that `queue` is not empty. In this case we need to link previous entry with the new. While waiting for the MCS lock, the next pointer may have been set by another lock waiter. We optimistically load the next pointer & prefetch the cacheline for writing to reduce latency in the upcoming MCS unlock operation: ```C if (old & _Q_TAIL_MASK) { @@ -384,7 +384,7 @@ and retrieve previous tail. The next step is to check that `queue` is not empty. WRITE_ONCE(prev->next, node); arch_mcs_spin_lock_contended(&node->locked); - + next = READ_ONCE(node->next); if (next) prefetchw(next); @@ -399,14 +399,14 @@ Yes, from this moment we are in the head of the `queue`. But before we are able val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK)); ``` -After both threads will release a lock, the head of the `queue` will hold a lock. In the end we just need to update the tail of the `queue` and remove current head from it. +After both threads will release a lock, the head of the `queue` will hold a lock. In the end we just need to update the tail of the `queue` and remove current head from it. That's all. Conclusion -------------------------------------------------------------------------------- -This is the end of the second part of the [synchronization primitives](https://en.wikipedia.org/wiki/Synchronization_%28computer_science%29) chapter in the Linux kernel. In the previous [part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-1) we already met the first synchronization primitive `spinlock` provided by the Linux kernel which is implemented as `ticket spinlock`. In this part we saw another implementation of the `spinlock` mechanism - `queued spinlock`. In the next part we will continue to dive into synchronization primitives in the Linux kernel. +This is the end of the second part of the [synchronization primitives](https://en.wikipedia.org/wiki/Synchronization_%28computer_science%29) chapter in the Linux kernel. In the previous [part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-1) we already met the first synchronization primitive `spinlock` provided by the Linux kernel which is implemented as `ticket spinlock`. In this part we saw another implementation of the `spinlock` mechanism - `queued spinlock`. In the next part we will continue to dive into synchronization primitives in the Linux kernel. If you have questions or suggestions, feel free to ping me in twitter [0xAX](https://twitter.com/0xAX), drop me [email](mailto:anotherworldofworld@gmail.com) or just create [issue](https://github.com/0xAX/linux-insides/issues/new). @@ -417,13 +417,13 @@ Links * [spinlock](https://en.wikipedia.org/wiki/Spinlock) * [interrupt](https://en.wikipedia.org/wiki/Interrupt) -* [interrupt handler](https://en.wikipedia.org/wiki/Interrupt_handler) +* [interrupt handler](https://en.wikipedia.org/wiki/Interrupt_handler) * [API](https://en.wikipedia.org/wiki/Application_programming_interface) * [Test and Set](https://en.wikipedia.org/wiki/Test-and-set) * [MCS](http://www.cs.rochester.edu/~scott/papers/1991_TOCS_synch.pdf) * [per-cpu variables](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) * [atomic instruction](https://en.wikipedia.org/wiki/Linearizability) -* [CMPXCHG instruction](http://x86.renejeschke.de/html/file_module_x86_id_41.html) +* [CMPXCHG instruction](http://x86.renejeschke.de/html/file_module_x86_id_41.html) * [LOCK instruction](http://x86.renejeschke.de/html/file_module_x86_id_159.html) * [NOP instruction](https://en.wikipedia.org/wiki/NOP) * [PREFETCHW instruction](http://www.felixcloutier.com/x86/PREFETCHW.html) diff --git a/SyncPrim/linux-sync-3.md b/SyncPrim/linux-sync-3.md index 050dbc9d..f89e678c 100644 --- a/SyncPrim/linux-sync-3.md +++ b/SyncPrim/linux-sync-3.md @@ -243,12 +243,12 @@ static inline int signal_pending_state(long state, struct task_struct *p) return 0; if (!signal_pending(p)) return 0; - + return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); } ``` -We check that the `state` [bitmask](https://en.wikipedia.org/wiki/Mask_%28computing%29) contains `TASK_INTERRUPTIBLE` or `TASK_WAKEKILL` bits and if the bitmask does not contain this bit we exit. At the next step we check that the given task has a pending signal and exit if there is no. In the end we just check `TASK_INTERRUPTIBLE` bit in the `state` bitmask again or the [SIGKILL](https://en.wikipedia.org/wiki/Unix_signal#SIGKILL) signal. So, if our task has a pending signal, we will jump at the `interrupted` label: +We check that the `state` [bitmask](https://en.wikipedia.org/wiki/Mask_%28computing%29) contains `TASK_INTERRUPTIBLE` or `TASK_WAKEKILL` bits and if the bitmask does not contain this bit we exit. At the next step we check that the given task has a pending signal and exit if there is not. In the end we just check `TASK_INTERRUPTIBLE` bit in the `state` bitmask again or the [SIGKILL](https://en.wikipedia.org/wiki/Unix_signal#SIGKILL) signal. So, if our task has a pending signal, we will jump at the `interrupted` label: ```C interrupted: @@ -306,7 +306,7 @@ void up(struct semaphore *sem) EXPORT_SYMBOL(up); ``` -It looks almost the same as the `down` function. There are only two differences here. First of all we increment a counter of a `semaphore` if the list of waiters is empty. In other way we call the `__up` function from the same source code file. If the list of waiters is not empty we need to allow the first task from the list to acquire a lock: +It looks almost the same as the `down` function. There are only two differences here. First of all we increment a counter of a `semaphore` if the list of waiters is empty. In other way we call the `__up` function from the same source code file. If the list of waiters is not empty we need to allow the first task from the list to acquire a lock: ```C static noinline void __sched __up(struct semaphore *sem) @@ -326,7 +326,7 @@ That's all. Conclusion -------------------------------------------------------------------------------- -This is the end of the third part of the [synchronization primitives](https://en.wikipedia.org/wiki/Synchronization_%28computer_science%29) chapter in the Linux kernel. In the two previous parts we already met the first synchronization primitive `spinlock` provided by the Linux kernel which is implemented as `ticket spinlock` and used for a very short time locks. In this part we saw yet another synchronization primitive - [semaphore](https://en.wikipedia.org/wiki/Semaphore_%28programming%29) which is used for long time locks as it leads to [context switch](https://en.wikipedia.org/wiki/Context_switch). In the next part we will continue to dive into synchronization primitives in the Linux kernel and will see next synchronization primitive - [mutex](https://en.wikipedia.org/wiki/Mutual_exclusion). +This is the end of the third part of the [synchronization primitives](https://en.wikipedia.org/wiki/Synchronization_%28computer_science%29) chapter in the Linux kernel. In the two previous parts we already met the first synchronization primitive `spinlock` provided by the Linux kernel which is implemented as `ticket spinlock` and used for a very short time locks. In this part we saw yet another synchronization primitive - [semaphore](https://en.wikipedia.org/wiki/Semaphore_%28programming%29) which is used for long time locks as it leads to [context switch](https://en.wikipedia.org/wiki/Context_switch). In the next part we will continue to dive into synchronization primitives in the Linux kernel and will see next synchronization primitive - [mutex](https://en.wikipedia.org/wiki/Mutual_exclusion). If you have questions or suggestions, feel free to ping me in twitter [0xAX](https://twitter.com/0xAX), drop me [email](mailto:anotherworldofworld@gmail.com) or just create [issue](https://github.com/0xAX/linux-insides/issues/new). diff --git a/SyncPrim/linux-sync-4.md b/SyncPrim/linux-sync-4.md index 2e9d674e..f97c5f32 100644 --- a/SyncPrim/linux-sync-4.md +++ b/SyncPrim/linux-sync-4.md @@ -23,7 +23,7 @@ struct semaphore { }; ``` -structure which holds information about state of a [lock](https://en.wikipedia.org/wiki/Lock_%28computer_science%29) and list of a lock waiters. Depends on the value of the `count` field, a `semaphore` can provide access to a resource of more than one wishing of this resource. The [mutex](https://en.wikipedia.org/wiki/Mutual_exclusion) concept is very similar to a [semaphore](https://en.wikipedia.org/wiki/Semaphore_%28programming%29) concept. But it has some differences. The main difference between `semaphore` and `mutex` synchronization primitive is that `mutex` has more strict semantic. Unlike a `semaphore`, only one [process](https://en.wikipedia.org/wiki/Process_%28computing%29) may hold `mutex` at one time and only the `owner` of a `mutex` may release or unlock it. Additional difference in implementation of `lock` [API](https://en.wikipedia.org/wiki/Application_programming_interface). The `semaphore` synchronization primitive forces rescheduling of processes which are in waiters list. The implementation of `mutex` lock `API` allows to avoid this situation and as a result expensive [context switches](https://en.wikipedia.org/wiki/Context_switch). +structure which holds information about state of a [lock](https://en.wikipedia.org/wiki/Lock_%28computer_science%29) and list of a lock waiters. Depending on the value of the `count` field, a `semaphore` can provide access to a resource to more than one processes wishing to access this resource. The [mutex](https://en.wikipedia.org/wiki/Mutual_exclusion) concept is very similar to a [semaphore](https://en.wikipedia.org/wiki/Semaphore_%28programming%29) concept. But it has some differences. The main difference between `semaphore` and `mutex` synchronization primitive is that `mutex` has more strict semantic. Unlike a `semaphore`, only one [process](https://en.wikipedia.org/wiki/Process_%28computing%29) may hold `mutex` at one time and only the `owner` of a `mutex` may release or unlock it. Additional difference in implementation of `lock` [API](https://en.wikipedia.org/wiki/Application_programming_interface). The `semaphore` synchronization primitive forces rescheduling of processes which are in waiters list. The implementation of `mutex` lock `API` allows to avoid this situation and has expensive [context switches](https://en.wikipedia.org/wiki/Context_switch). The `mutex` synchronization primitive represented by the following: @@ -47,13 +47,13 @@ struct mutex { }; ``` -structure in the Linux kernel. This structure is defined in the [include/linux/mutex.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/mutex.h) header file and contains similar to the `semaphore` structure set of fields. The first field of the `mutex` structure is - `count`. Value of this field represents state of a `mutex`. In a case when the value of the `count` field is `1`, a `mutex` is in `unlocked` state. When the value of the `count` field is `zero`, a `mutex` is in the `locked` state. Additionally value of the `count` field may be `negative`. In this case a `mutex` is in the `locked` state and has possible waiters. +structure in the Linux kernel. This structure is defined in the [include/linux/mutex.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/mutex.h) header file and contains a set of fields similar to the `semaphore` structure. The first field of the `mutex` structure is - `count`. Value of this field represents state of a `mutex`. In a case when the value of the `count` field is `1`, a `mutex` is in `unlocked` state. When the value of the `count` field is `zero`, a `mutex` is in the `locked` state. Additionally value of the `count` field may be `negative`. In this case a `mutex` is in the `locked` state and has possible waiters. The next two fields of the `mutex` structure - `wait_lock` and `wait_list` are [spinlock](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/mutex.h) for the protection of a `wait queue` and list of waiters which represents this `wait queue` for a certain lock. As you may notice, the similarity of the `mutex` and `semaphore` structures ends. Remaining fields of the `mutex` structure, as we may see depends on different configuration options of the Linux kernel. The first field - `owner` represents [process](https://en.wikipedia.org/wiki/Process_%28computing%29) which acquired a lock. As we may see, existence of this field in the `mutex` structure depends on the `CONFIG_DEBUG_MUTEXES` or `CONFIG_MUTEX_SPIN_ON_OWNER` kernel configuration options. Main point of this field and the next `osq` fields is support of `optimistic spinning` which we will see later. The last two fields - `magic` and `dep_map` are used only in [debugging](https://en.wikipedia.org/wiki/Debugging) mode. The `magic` field is to storing a `mutex` related information for debugging and the second field - `lockdep_map` is for [lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt) of the Linux kernel. -Now, after we have considered the `mutex` structure, we may consider how this synchronization primitive works in the Linux kernel. As you may guess, a process which wants to acquire a lock, must to decrease value of the `mutex->count` if possible. And if a process wants to release a lock, it must to increase the same value. That's true. But as you may also guess, it is not so simple in the Linux kernel. +Now, after we have considered the `mutex` structure, we may consider how this synchronization primitive works in the Linux kernel. As you may guess, a process who wants to acquire a lock, must to decrease value of the `mutex->count` if possible. And if a process wants to release a lock, it must to increase the same value. That's true. But as you may also guess, it is not so simple in the Linux kernel. Actually, when a process try to acquire a `mutex`, there three possible paths: @@ -63,7 +63,7 @@ Actually, when a process try to acquire a `mutex`, there three possible paths: which may be taken, depending on the current state of the `mutex`. The first path or `fastpath` is the fastest as you may understand from its name. Everything is easy in this case. Nobody acquired a `mutex`, so the value of the `count` field of the `mutex` structure may be directly decremented. In a case of unlocking of a `mutex`, the algorithm is the same. A process just increments the value of the `count` field of the `mutex` structure. Of course, all of these operations must be [atomic](https://en.wikipedia.org/wiki/Linearizability). -Yes, this looks pretty easy. But what happens if a process wants to acquire a `mutex` which is already acquired by other process? In this case, the control will be transferred to the second path - `midpath`. The `midpath` or `optimistic spinning` tries to [spin](https://en.wikipedia.org/wiki/Spinlock) with already familiar for us [MCS lock](http://www.cs.rochester.edu/~scott/papers/1991_TOCS_synch.pdf) while the lock owner is running. This path will be executed only if there are no other processes ready to run that have higher priority. This path is called `optimistic` because the waiting task will not be sleep and rescheduled. This allows to avoid expensive [context switch](https://en.wikipedia.org/wiki/Context_switch). +Yes, this looks pretty easy. But what happens if a process wants to acquire a `mutex` which is already acquired by other process? In this case, the control will be transferred to the second path - `midpath`. The `midpath` or `optimistic spinning` tries to [spin](https://en.wikipedia.org/wiki/Spinlock) with already familiar for us [MCS lock](http://www.cs.rochester.edu/~scott/papers/1991_TOCS_synch.pdf) while the lock owner is running. This path will be executed only if there are no other processes ready to run that have higher priority. This path is called `optimistic` because the waiting task will not sleep and be rescheduled. This allows to avoid expensive [context switch](https://en.wikipedia.org/wiki/Context_switch). In the last case, when the `fastpath` and `midpath` may not be executed, the last path - `slowpath` will be executed. This path acts like a [semaphore](https://en.wikipedia.org/wiki/Semaphore_%28programming%29) lock. If the lock is unable to be acquired by a process, this process will be added to `wait queue` which is represented by the following: @@ -77,7 +77,7 @@ struct mutex_waiter { }; ``` -structure from the [include/linux/mutex.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/mutex.h) header file and will be sleep. Before we will consider [API](https://en.wikipedia.org/wiki/Application_programming_interface) which is provided by the Linux kernel for manipulation with `mutexes`, let's consider the `mutex_waiter` structure. If you have read the [previous part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-3) of this chapter, you may notice that the `mutex_waiter` structure is similar to the `semaphore_waiter` structure from the [kernel/locking/semaphore.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/locking/semaphore.c) source code file: +structure from the [include/linux/mutex.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/mutex.h) header file and will sleep. Before we will consider [API](https://en.wikipedia.org/wiki/Application_programming_interface) which is provided by the Linux kernel for manipulation of `mutexes`, let's consider the `mutex_waiter` structure. If you have read the [previous part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-3) of this chapter, you may notice that the `mutex_waiter` structure is similar to the `semaphore_waiter` structure from the [kernel/locking/semaphore.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/locking/semaphore.c) source code file: ```C struct semaphore_waiter { @@ -87,16 +87,16 @@ struct semaphore_waiter { }; ``` -It also contains `list` and `task` fields which are represent entry of the mutex wait queue. The one difference here that the `mutex_waiter` does not contains `up` field, but contains the `magic` field which depends on the `CONFIG_DEBUG_MUTEXES` kernel configuration option and used to store a `mutex` related information for debugging purpose. +It also contains `list` and `task` fields which represent entry of the mutex wait queue. The one difference here that the `mutex_waiter` does not contains `up` field, but contains the `magic` field which depends on the `CONFIG_DEBUG_MUTEXES` kernel configuration option and used to store a `mutex` related information for debugging purpose. -Now we know what is it `mutex` and how it is represented the Linux kernel. In this case, we may go ahead and start to look at the [API](https://en.wikipedia.org/wiki/Application_programming_interface) which the Linux kernel provides for manipulation of `mutexes`. +Now we know what is a `mutex` and how it is represented the Linux kernel. In this case, we may go ahead and start to look at the [API](https://en.wikipedia.org/wiki/Application_programming_interface) which the Linux kernel provides for manipulation of `mutexes`. Mutex API -------------------------------------------------------------------------------- -Ok, in the previous paragraph we knew what is it `mutex` synchronization primitive and saw the `mutex` structure which represents `mutex` in the Linux kernel. Now it's time to consider [API](https://en.wikipedia.org/wiki/Application_programming_interface) for manipulation of mutexes. Description of the `mutex` API is located in the [include/linux/mutex.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/mutex.h) header file. As always, before we will consider how to acquire and release a `mutex`, we need to know how to initialize it. +Ok, in the previous paragraph we knew what is a `mutex` synchronization primitive and saw the `mutex` structure which represents `mutex` in the Linux kernel. Now it's time to consider [API](https://en.wikipedia.org/wiki/Application_programming_interface) for manipulation of mutexes. Description of the `mutex` API is located in the [include/linux/mutex.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/mutex.h) header file. As always, before we will consider how to acquire and release a `mutex`, we need to know how to initialize it. -There are two approaches to initialize a `mutex`. The first is to do it statically. For this purpose the Linux kernel provides following: +There are two approaches to initializing a `mutex`. The first is to do it statically. For this purpose the Linux kernel provides following: ```C #define DEFINE_MUTEX(mutexname) \ @@ -114,9 +114,9 @@ macro. Let's consider implementation of this macro. As we may see, the `DEFINE_M } ``` -This macro is defined in the [same](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/mutex.h) header file and as we may understand it initializes fields of the `mutex` structure the initial values. The `count` field get initialized with the `1` which represents `unlocked` state of a mutex. The `wait_lock` [spinlock](https://en.wikipedia.org/wiki/Spinlock) get initialized to the unlocked state and the last field `wait_list` to empty [doubly linked list](https://0xax.gitbook.io/linux-insides/summary/datastructures/linux-datastructures-1). +This macro is defined in the [same](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/mutex.h) header file and as we may understand it initializes fields of the `mutex` structure to their initial values. The `count` field get initialized with the `1` which represents `unlocked` state of a mutex. The `wait_lock` [spinlock](https://en.wikipedia.org/wiki/Spinlock) get initialized to the unlocked state and the last field `wait_list` to empty [doubly linked list](https://0xax.gitbook.io/linux-insides/summary/datastructures/linux-datastructures-1). -The second approach allows us to initialize a `mutex` dynamically. To do this we need to call the `__mutex_init` function from the [kernel/locking/mutex.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/locking/mutex.c) source code file. Actually, the `__mutex_init` function rarely called directly. Instead of the `__mutex_init`, the: +The second approach allows us to initialize a `mutex` dynamically. To do this we need to call the `__mutex_init` function from the [kernel/locking/mutex.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/locking/mutex.c) source code file. Actually, the `__mutex_init` function is rarely called directly. Instead of the `__mutex_init`, the: ```C # define mutex_init(mutex) \ @@ -150,7 +150,7 @@ As we may see the `__mutex_init` function takes three arguments: * `name` - name of mutex for debugging purpose; * `key` - key for [lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt). -At the beginning of the `__mutex_init` function, we may see initialization of the `mutex` state. We set it to `unlocked` state with the `atomic_set` function which atomically set the give variable to the given value. After this we may see initialization of the `spinlock` to the unlocked state which will protect `wait queue` of the `mutex` and initialization of the `wait queue` of the `mutex`. After this we clear owner of the `lock` and initialize optimistic queue by the call of the `osq_lock_init` function from the [include/linux/osq_lock.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/osq_lock.h) header file. This function just sets the tail of the optimistic queue to the unlocked state: +At the beginning of the `__mutex_init` function, we may see initialization of the `mutex` state. We set it to `unlocked` state with the `atomic_set` function which atomically sets the variable to the given value. After this we may see initialization of the `spinlock` to the unlocked state which will protect `wait queue` of the `mutex` and initialization of the `wait queue` of the `mutex`. After this we clear owner of the `lock` and initialize optimistic queue by the call of the `osq_lock_init` function from the [include/linux/osq_lock.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/osq_lock.h) header file. This function just sets the tail of the optimistic queue to the unlocked state: ```C static inline bool osq_is_locked(struct optimistic_spin_queue *lock) @@ -161,7 +161,7 @@ static inline bool osq_is_locked(struct optimistic_spin_queue *lock) In the end of the `__mutex_init` function we may see the call of the `debug_mutex_init` function, but as I already wrote in previous parts of this [chapter](https://0xax.gitbook.io/linux-insides/summary/syncprim), we will not consider debugging related stuff in this chapter. -After the `mutex` structure is initialized, we may go ahead and will look at the `lock` and `unlock` API of `mutex` synchronization primitive. Implementation of `mutex_lock` and `mutex_unlock` functions located in the [kernel/locking/mutex.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/locking/mutex.c) source code file. First of all let's start from the implementation of the `mutex_lock`. It looks: +After the `mutex` structure is initialized, we may go ahead and will look at the `lock` and `unlock` API of `mutex` synchronization primitive. Implementation of `mutex_lock` and `mutex_unlock` functions is located in the [kernel/locking/mutex.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/locking/mutex.c) source code file. First of all let's start from the implementation of the `mutex_lock`. It looks: ```C void __sched mutex_lock(struct mutex *lock) @@ -176,7 +176,7 @@ We may see the call of the `might_sleep` macro from the [include/linux/kernel.h] After the `might_sleep` macro, we may see the call of the `__mutex_fastpath_lock` function. This function is architecture-specific and as we consider [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture in this book, the implementation of the `__mutex_fastpath_lock` is located in the [arch/x86/include/asm/mutex_64.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/mutex_64.h) header file. As we may understand from the name of the `__mutex_fastpath_lock` function, this function will try to acquire lock in a fast path or in other words this function will try to decrement the value of the `count` of the given mutex. -Implementation of the `__mutex_fastpath_lock` function consists from two parts. The first part is [inline assembly](https://0xax.gitbook.io/linux-insides/summary/theory/linux-theory-3) statement. Let's look at it: +Implementation of the `__mutex_fastpath_lock` function consists of two parts. The first part is [inline assembly](https://0xax.gitbook.io/linux-insides/summary/theory/linux-theory-3) statement. Let's look at it: ```C asm_volatile_goto(LOCK_PREFIX " decl %0\n" @@ -205,13 +205,13 @@ exit: return; ``` -For this moment he implementation of the `__mutex_fastpath_lock` function looks pretty easy. But the value of the `mutex->counter` may be negative after increment. In this case the: +For this moment the implementation of the `__mutex_fastpath_lock` function looks pretty easy. But the value of the `mutex->counter` may be negative after decrement. In this case the: ```C fail_fn(v); ``` -will be called after our inline assembly statement. The `fail_fn` is the second parameter of the `__mutex_fastpath_lock` function and represents pointer to function which represents `midpath/slowpath` paths to acquire the given lock. In our case the `fail_fn` is the `__mutex_lock_slowpath` function. Before we will look at the implementation of the `__mutex_lock_slowpath` function, let's finish with the implementation of the `mutex_lock` function. In the simplest way, the lock will be acquired successfully by a process and the `__mutex_fastpath_lock` will be finished. In this case, we just call the +will be called after our inline assembly statement. The `fail_fn` is the second parameter of the `__mutex_fastpath_lock` function and represents pointer to function which represents `midpath/slowpath` paths to acquire the given lock. In our case the `fail_fn` is the `__mutex_lock_slowpath` function. Before we look at the implementation of the `__mutex_lock_slowpath` function, let's finish with the implementation of the `mutex_lock` function. In the simplest way, the lock will be acquired successfully by a process and the `__mutex_fastpath_lock` will be finished. In this case, we just call the ```C mutex_set_owner(lock); @@ -254,7 +254,7 @@ if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { } ``` -First of all the `mutex_optimistic_spin` function check that we don't need to reschedule or in other words there are no other tasks ready to run that have higher priority. If this check was successful we need to update `MCS` lock wait queue with the current spin. In this way only one spinner can complete for the mutex at one time: +First of all, `mutex_optimistic_spin` checks that we don't need to reschedule or in other words there are no other tasks ready to run that have higher priority. If this check was successful we need to update `MCS` lock wait queue with the current spin. In this way only one spinner can complete for the mutex at one time: ```C osq_lock(&lock->osq) @@ -279,7 +279,7 @@ while (true) { } ``` -and try to acquire a lock. First of all we try to take current owner and if the owner exists (it may not exists in a case when a process already released a mutex) and we wait for it in the `mutex_spin_on_owner` function before the owner will release a lock. If new task with higher priority have appeared during wait of the lock owner, we break the loop and go to sleep. In other case, the process already may release a lock, so we try to acquire a lock with the `mutex_try_to_acquired`. If this operation finished successfully, we set new owner for the given mutex, removes ourself from the `MCS` wait queue and exit from the `mutex_optimistic_spin` function. At this state a lock will be acquired by a process and we enable [preemption](https://en.wikipedia.org/wiki/Preemption_%28computing%29) and exit from the `__mutex_lock_common` function: +and try to acquire a lock. First of all we try to take current owner and if the owner exists (it may not exist in a case when a process already released a mutex) and we wait for it in the `mutex_spin_on_owner` function before the owner will release a lock. If new task with higher priority have appeared during wait of the lock owner, we break the loop and go to sleep. In other case, the process already may release a lock, so we try to acquire a lock with the `mutex_try_to_acquired`. If this operation finished successfully, we set new owner for the given mutex, removes ourself from the `MCS` wait queue and exit from the `mutex_optimistic_spin` function. At this stage, a lock will be acquired by a process and we enable [preemption](https://en.wikipedia.org/wiki/Preemption_%28computing%29) and exit from the `__mutex_lock_common` function: ```C if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { @@ -303,7 +303,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, #endif ``` -In all of these cases, the `__mutex_lock_common` function will acct like a `semaphore`. We try to acquire a lock again because the owner of a lock might already release a lock before this time: +In all of these cases, the `__mutex_lock_common` function will act like a `semaphore`. We try to acquire a lock again because the owner of a lock might already release a lock before this time: ```C if (!mutex_is_locked(lock) && @@ -324,7 +324,7 @@ In a successful case we update the owner of a lock, enable preemption and exit f skip_wait: mutex_set_owner(lock); preempt_enable(); - return 0; + return 0; ``` In this case a lock will be acquired. If can't acquire a lock for now, we enter into the following loop: @@ -338,7 +338,7 @@ for (;;) { if (unlikely(signal_pending_state(state, task))) { ret = -EINTR; goto err; - } + } __set_task_state(task, state); @@ -348,7 +348,7 @@ for (;;) { where try to acquire a lock again and exit if this operation was successful. Yes, we try to acquire a lock again right after unsuccessful try before the loop. We need to do it to make sure that we get a wakeup once a lock will be unlocked. Besides this, it allows us to acquire a lock after sleep. In other case we check the current process for pending [signals](https://en.wikipedia.org/wiki/Unix_signal) and exit if the process was interrupted by a `signal` during wait for a lock acquisition. In the end of loop we didn't acquire a lock, so we set the task state for `TASK_UNINTERRUPTIBLE` and go to sleep with call of the `schedule_preempt_disabled` function. -That's all. We have considered all three possible paths through which a process may pass when it will want to acquire a lock. Now let's consider how `mutex_unlock` is implemented. When the `mutex_unlock` will be called by a process which wants to release a lock, the `__mutex_fastpath_unlock` will be called from the [arch/x86/include/asm/mutex_64.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/mutex_64.h) header file: +That's all. We have considered all three possible paths through which a process may pass when it will want to acquire a lock. Now let's consider how `mutex_unlock` is implemented. When the `mutex_unlock` is called by a process which wants to release a lock, the `__mutex_fastpath_unlock` will be called from the [arch/x86/include/asm/mutex_64.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/mutex_64.h) header file: ```C void __sched mutex_unlock(struct mutex *lock) @@ -385,12 +385,12 @@ __mutex_unlock_slowpath(atomic_t *lock_count) } ``` -In the `__mutex_unlock_common_slowpath` function we will get the first entry from the wait queue if the wait queue is not empty and wakeup related process: +In the `__mutex_unlock_common_slowpath` function we will get the first entry from the wait queue if the wait queue is not empty and wake up related process: ```C if (!list_empty(&lock->wait_list)) { struct mutex_waiter *waiter = - list_entry(lock->wait_list.next, struct mutex_waiter, list); + list_entry(lock->wait_list.next, struct mutex_waiter, list); wake_up_process(waiter->task); } ``` @@ -423,7 +423,7 @@ Links * [Spinlock](https://en.wikipedia.org/wiki/Spinlock) * [Semaphore](https://en.wikipedia.org/wiki/Semaphore_%28programming%29) * [Synchronization primitives](https://en.wikipedia.org/wiki/Synchronization_%28computer_science%29) -* [API](https://en.wikipedia.org/wiki/Application_programming_interface) +* [API](https://en.wikipedia.org/wiki/Application_programming_interface) * [Locking mechanism](https://en.wikipedia.org/wiki/Lock_%28computer_science%29) * [Context switches](https://en.wikipedia.org/wiki/Context_switch) * [lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt) diff --git a/SyncPrim/linux-sync-5.md b/SyncPrim/linux-sync-5.md index 2daee1a4..711222a8 100644 --- a/SyncPrim/linux-sync-5.md +++ b/SyncPrim/linux-sync-5.md @@ -13,7 +13,7 @@ So, let's start. Reader/Writer semaphore -------------------------------------------------------------------------------- -Actually there are two types of operations may be performed on the data. We may read data and make changes in data. Two fundamental operations - `read` and `write`. Usually (but not always), `read` operation is performed more often than `write` operation. In this case, it would be logical to we may lock data in such way, that some processes may read locked data in one time, on condition that no one will not change the data. The [readers/writer lock](https://en.wikipedia.org/wiki/Readers%E2%80%93writer_lock) allows us to get this lock. +Actually there are two types of operations may be performed on the data. We may read data and make changes in data. Two fundamental operations - `read` and `write`. Usually (but not always), `read` operation is performed more often than `write` operation. In this case, it would be logical to lock data in such way, that some processes may read locked data in one time, on condition that no one will not change the data. The [readers/writer lock](https://en.wikipedia.org/wiki/Readers%E2%80%93writer_lock) allows us to get this lock. When a process which wants to write something into data, all other `writer` and `reader` processes will be blocked until the process which acquired a lock, will not release it. When a process reads data, other processes which want to read the same data too, will not be locked and will be able to do this. As you may guess, implementation of the `reader/writer semaphore` is based on the implementation of the `normal semaphore`. We already familiar with the [semaphore](https://en.wikipedia.org/wiki/Semaphore_%28programming%29) synchronization primitive from the third [part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-4) of this chapter. From the theoretical side everything looks pretty simple. Let's look how `reader/writer semaphore` is represented in the Linux kernel. @@ -81,7 +81,7 @@ Reader/Writer semaphore API So, we know a little about `reader/writer semaphores` from theoretical side, let's look on its implementation in the Linux kernel. All `reader/writer semaphores` related [API](https://en.wikipedia.org/wiki/Application_programming_interface) is located in the [include/linux/rwsem.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/rwsem.h) header file. -As always Before we will consider an [API](https://en.wikipedia.org/wiki/Application_programming_interface) of the `reader/writer semaphore` mechanism in the Linux kernel, we need to know how to initialize the `rw_semaphore` structure. As we already saw in previous parts of this [chapter](https://0xax.gitbook.io/linux-insides/summary/syncprim), all [synchronization primitives](https://en.wikipedia.org/wiki/Synchronization_%28computer_science%29) may be initialized in two ways: +As always, before we consider an [API](https://en.wikipedia.org/wiki/Application_programming_interface) of the `reader/writer semaphore` mechanism in the Linux kernel, we need to know how to initialize the `rw_semaphore` structure. As we already saw in previous parts of this [chapter](https://0xax.gitbook.io/linux-insides/summary/syncprim), all [synchronization primitives](https://en.wikipedia.org/wiki/Synchronization_%28computer_science%29) may be initialized in two ways: * `statically`; * `dynamically`. @@ -103,7 +103,7 @@ As we may see, the `DECLARE_RWSEM` macro just expands to the definition of the ` .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock) \ __RWSEM_OPT_INIT(name) \ __RWSEM_DEP_MAP_INIT(name) -} +} ``` and expands to the initialization of fields of `rw_semaphore` structure. First of all we initialize `count` field of the `rw_semaphore` structure to the `unlocked` state with `RWSEM_UNLOCKED_VALUE` macro from the [arch/x86/include/asm/rwsem.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/rwsem.h) architecture specific header file: @@ -193,7 +193,7 @@ void __sched down_write(struct rw_semaphore *sem) } ``` -We already met the `might_sleep` macro in the [previous part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-4). In short words, Implementation of the `might_sleep` macro depends on the `CONFIG_DEBUG_ATOMIC_SLEEP` kernel configuration option and if this option is enabled, this macro just prints a stack trace if it was executed in [atomic](https://en.wikipedia.org/wiki/Linearizability) context. As this macro is mostly for debugging purpose we will skip it and will go ahead. Additionally we will skip the next macro from the `down_read` function - `rwsem_acquire` which is related to the [lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt) of the Linux kernel, because this is topic of other part. +We already met the `might_sleep` macro in the [previous part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-4). In short, implementation of the `might_sleep` macro depends on the `CONFIG_DEBUG_ATOMIC_SLEEP` kernel configuration option and if this option is enabled, this macro just prints a stack trace if it was executed in [atomic](https://en.wikipedia.org/wiki/Linearizability) context. As this macro is mostly for debugging purpose we will skip it and will go ahead. Additionally we will skip the next macro from the `down_read` function - `rwsem_acquire` which is related to the [lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt) of the Linux kernel, because this is topic of other part. The only two things that remained in the `down_write` function is the call of the `LOCK_CONTENDED` macro which is defined in the [include/linux/lockdep.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/lockdep.h) header file and setting of owner of a lock with the `rwsem_set_owner` function which sets owner to currently running process: @@ -292,7 +292,7 @@ if (rwsem_optimistic_spin(sem)) return sem; ``` -We will skip implementation of the `rwsem_optimistic_spin` function, as it is similar on the `mutex_optimistic_spin` function which we saw in the [previous part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-4). In short words we check existence other tasks ready to run that have higher priority in the `rwsem_optimistic_spin` function. If there are such tasks, the process will be added to the [MCS](http://www.cs.rochester.edu/~scott/papers/1991_TOCS_synch.pdf) `waitqueue` and start to spin in the loop until a lock will be able to be acquired. If `optimistic spinning` is disabled, a process will be added to the and marked as waiting for write: +We will skip implementation of the `rwsem_optimistic_spin` function, as it is similar on the `mutex_optimistic_spin` function which we saw in the [previous part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-4). In short words we check existence other tasks ready to run that have higher priority in the `rwsem_optimistic_spin` function. If there are such tasks, the process will be added to the [MCS](http://www.cs.rochester.edu/~scott/papers/1991_TOCS_synch.pdf) `waitqueue` and start to spin in the loop until a lock will be able to be acquired. If `optimistic spinning` is disabled, a process will be added to the `wait_list` and marked as waiting for write: ```C waiter.task = current; @@ -356,9 +356,9 @@ static inline void __down_read(struct rw_semaphore *sem) } ``` -which increments value of the given `rw_semaphore->count` and call the `call_rwsem_down_read_failed` if this value is negative. In other way we jump at the label `1:` and exit. After this `read` lock will be successfully acquired. Notice that we check a sign of the `count` value as it may be negative, because as you may remember most significant [word](https://en.wikipedia.org/wiki/Word_%28computer_architecture%29) of the `rw_semaphore->count` contains negated number of active writers. +which increments value of the given `rw_semaphore->count` and calls the `call_rwsem_down_read_failed` if this value is negative. In other way we jump at the label `1:` and exit. After this `read` lock will be successfully acquired. Notice that we check a sign of the `count` value as it may be negative, because as you may remember most significant [word](https://en.wikipedia.org/wiki/Word_%28computer_architecture%29) of the `rw_semaphore->count` contains negated number of active writers. -Let's consider case when a process wants to acquire a lock for `read` operation, but it is already locked. In this case the `call_rwsem_down_read_failed` function from the [arch/x86/lib/rwsem.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/lib/rwsem.S) assembly file will be called. If you will look at the implementation of this function, you will notice that it does the same that `call_rwsem_down_read_failed` function does. Except it calls the `rwsem_down_read_failed` function instead of `rwsem_dow_write_failed`. Now let's consider implementation of the `rwsem_down_read_failed` function. It starts from the adding a process to the `wait queue` and updating of value of the `rw_semaphore->counter`: +Let's consider case when a process wants to acquire a lock for `read` operation, but it is already locked. In this case the `call_rwsem_down_read_failed` function from the [arch/x86/lib/rwsem.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/lib/rwsem.S) assembly file will be called. If you will look at the implementation of this function, you will notice that it does the same that `call_rwsem_down_write_failed` function does. Except it calls the `rwsem_down_read_failed` function instead of `rwsem_down_write_failed`. Now let's consider implementation of the `rwsem_down_read_failed` function. It starts from the adding a process to the `wait queue` and updating of value of the `rw_semaphore->counter`: ```C long adjustment = -RWSEM_ACTIVE_READ_BIAS; diff --git a/SyncPrim/linux-sync-6.md b/SyncPrim/linux-sync-6.md index 023087d6..048bbcc4 100644 --- a/SyncPrim/linux-sync-6.md +++ b/SyncPrim/linux-sync-6.md @@ -4,20 +4,20 @@ Synchronization primitives in the Linux kernel. Part 6. Introduction -------------------------------------------------------------------------------- -This is the sixth part of the chapter which describes [synchronization primitives](https://en.wikipedia.org/wiki/Synchronization_\(computer_science\)) in the Linux kernel and in the previous parts we finished to consider different [readers-writer lock](https://en.wikipedia.org/wiki/Readers%E2%80%93writer_lock) synchronization primitives. We will continue to learn synchronization primitives in this part and start to consider a similar synchronization primitive which can be used to avoid the `writer starvation` problem. The name of this synchronization primitive is - `seqlock` or `sequential locks`. +This is the sixth part of the chapter which describes [synchronization primitives](https://en.wikipedia.org/wiki/Synchronization_(computer_science)) in the Linux kernel and in the previous parts we finished to consider different [readers-writer lock](https://en.wikipedia.org/wiki/Readers%E2%80%93writer_lock) synchronization primitives. We will continue to learn synchronization primitives in this part and start to consider a similar synchronization primitive which can be used to avoid the `writer starvation` problem. The name of this synchronization primitive is - `seqlock` or `sequential locks`. We know from the previous [part](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-5) that [readers-writer lock](https://en.wikipedia.org/wiki/Readers%E2%80%93writer_lock) is a special lock mechanism which allows concurrent access for read-only operations, but an exclusive lock is needed for writing or modifying data. As we may guess, it may lead to a problem which is called `writer starvation`. In other words, a writer process can't acquire a lock as long as at least one reader process which acquired a lock holds it. So, in the situation when contention is high, it will lead to situation when a writer process which wants to acquire a lock will wait for it for a long time. The `seqlock` synchronization primitive can help solve this problem. -As in all previous parts of this [book](https://github.com/0xAX/linux-insides/blob/master/SUMMARY.md), we will try to consider this synchronization primitive from the theoretical side and only than we will consider [API](https://en.wikipedia.org/wiki/Application_programming_interface) provided by the Linux kernel to manipulate with `seqlocks`. +As in all previous parts of this [book](https://github.com/0xAX/linux-insides/blob/master/SUMMARY.md), we will try to consider this synchronization primitive from the theoretical side and only than we will consider [API](https://en.wikipedia.org/wiki/Application_programming_interface) provided by the Linux kernel to manipulate the `seqlocks`. So, let's start. Sequential lock -------------------------------------------------------------------------------- -So, what is a `seqlock` synchronization primitive and how does it work? Let's try to answer on these questions in this paragraph. Actually `sequential locks` were introduced in the Linux kernel 2.6.x. Main point of this synchronization primitive is to provide fast and lock-free access to shared resources. Since the heart of `sequential lock` synchronization primitive is [spinlock](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-1) synchronization primitive, `sequential locks` work in situations where the protected resources are small and simple. Additionally write access must be rare and also should be fast. +So, what is a `seqlock` synchronization primitive and how does it work? Let's try to answer these questions in this paragraph. Actually `sequential locks` were introduced in the Linux kernel 2.6.x. Main point of this synchronization primitive is to provide fast and lock-free access to shared resources. Since the heart of `sequential lock` synchronization primitive is [spinlock](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-1) synchronization primitive, `sequential locks` work in situations where the protected resources are small and simple. Additionally write access must be rare and also should be fast. Work of this synchronization primitive is based on the sequence of events counter. Actually a `sequential lock` allows free access to a resource for readers, but each reader must check existence of conflicts with a writer. This synchronization primitive introduces a special counter. The main algorithm of work of `sequential locks` is simple: Each writer which acquired a sequential lock increments this counter and additionally acquires a [spinlock](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-1). When this writer finishes, it will release the acquired spinlock to give access to other writers and increment the counter of a sequential lock again. @@ -114,7 +114,7 @@ So we just initialize counter of the given sequential lock to zero and additiona #endif ``` -As I already wrote in previous parts of this [chapter](https://0xax.gitbook.io/linux-insides/summary/syncprim) we will not consider [debugging](https://en.wikipedia.org/wiki/Debugging) and [lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt) related stuff in this part. So for now we just skip the `SEQCOUNT_DEP_MAP_INIT` macro. The second field of the given `seqlock_t` is `lock` initialized with the `__SPIN_LOCK_UNLOCKED` macro which is defined in the [include/linux/spinlock_types.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/spinlock_types.h) header file. We will not consider implementation of this macro here as it just initialize [rawspinlock](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-1) with architecture-specific methods (More abot spinlocks you may read in first parts of this [chapter](https://0xax.gitbook.io/linux-insides/summary/syncprim)). +As I already wrote in previous parts of this [chapter](https://0xax.gitbook.io/linux-insides/summary/syncprim) we will not consider [debugging](https://en.wikipedia.org/wiki/Debugging) and [lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt) related stuff in this part. So for now we just skip the `SEQCOUNT_DEP_MAP_INIT` macro. The second field of the given `seqlock_t` is `lock` initialized with the `__SPIN_LOCK_UNLOCKED` macro which is defined in the [include/linux/spinlock_types.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/spinlock_types.h) header file. We will not consider implementation of this macro here as it just initializes [rawspinlock](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-1) with architecture-specific methods (More about spinlocks you may read in first parts of this [chapter](https://0xax.gitbook.io/linux-insides/summary/syncprim)). We have considered the first way to initialize a sequential lock. Let's consider second way to do the same, but do it dynamically. We can initialize a sequential lock with the `seqlock_init` macro which is defined in the same [include/linux/seqlock.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/seqlock.h) header file. @@ -164,7 +164,7 @@ static inline void read_seqlock_excl(seqlock_t *sl) static inline void read_sequnlock_excl(seqlock_t *sl) ``` -and others. Before we move on to considering the implementation of this [API](https://en.wikipedia.org/wiki/Application_programming_interface), we must know that actually there are two types of readers. The first type of reader never blocks a writer process. In this case writer will not wait for readers. The second type of reader which can lock. In this case, the locking reader will block the writer as it will wait while reader will not release its lock. +and others. Before we move on to considering the implementation of this [API](https://en.wikipedia.org/wiki/Application_programming_interface), we must know that there actually are two types of readers. The first type of reader never blocks a writer process. In this case writer will not wait for readers. The second type of reader which can lock. In this case, the locking reader will block the writer as it will wait while reader will not release its lock. First of all let's consider the first type of readers. The `read_seqbegin` function begins a seq-read [critical section](https://en.wikipedia.org/wiki/Critical_section). @@ -281,7 +281,7 @@ static inline void raw_write_seqcount_begin(seqcount_t *s) } ``` -When a writer process will finish to modify data, the `write_sequnlock` function must be called to release a lock and give access to other writers or readers. Let's consider at the implementation of the `write_sequnlock` function. It looks pretty simple: +When a writer process will finish to modify data, the `write_sequnlock` function must be called to release a lock and give access to other writers or readers. Let's consider the implementation of the `write_sequnlock` function. It looks pretty simple: ```C static inline void write_sequnlock(seqlock_t *sl) @@ -321,7 +321,7 @@ static inline void write_sequnlock_irq(seqlock_t *sl) As we may see, these functions differ only in the initialization of spinlock. They call `spin_lock_irq` and `spin_unlock_irq` instead of `spin_lock` and `spin_unlock`. -Or for example `write_seqlock_irqsave` and `write_sequnlock_irqrestore` functions which are the same but used `spin_lock_irqsave` and `spin_unlock_irqsave` macro to use in [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_\(PC_architecture\)) handlers. +Or for example `write_seqlock_irqsave` and `write_sequnlock_irqrestore` functions which are the same but used `spin_lock_irqsave` and `spin_unlock_irqsave` macro to use in [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_(PC_architecture)) handlers. That's all. @@ -338,7 +338,7 @@ Links -------------------------------------------------------------------------------- * [synchronization primitives](https://en.wikipedia.org/wiki/Synchronization_\(computer_science\)) -* [readers-writer lock](https://en.wikipedia.org/wiki/Readers%E2%80%93writer_lock) +* [readers-writer lock](https://en.wikipedia.org/wiki/Readers%E2%80%93writer_lock) * [spinlock](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-1) * [critical section](https://en.wikipedia.org/wiki/Critical_section) * [lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt) diff --git a/SysCall/README.md b/SysCall/README.md index 2e65c50c..9643006e 100644 --- a/SysCall/README.md +++ b/SysCall/README.md @@ -1,6 +1,6 @@ # System calls -This chapter describes the `system call` concept in the linux kernel. +This chapter describes the `system call` concept in the Linux kernel. * [Introduction to system call concept](linux-syscall-1.md) - this part is introduction to the `system call` concept in the Linux kernel. * [How the Linux kernel handles a system call](linux-syscall-2.md) - this part describes how the Linux kernel handles a system call from a userspace application. diff --git a/SysCall/images/ls_shell.png b/SysCall/images/ls_shell.png index a34e7930..653dd092 100644 Binary files a/SysCall/images/ls_shell.png and b/SysCall/images/ls_shell.png differ diff --git a/SysCall/linux-syscall-2.md b/SysCall/linux-syscall-2.md index 09051494..c22e0578 100644 --- a/SysCall/linux-syscall-2.md +++ b/SysCall/linux-syscall-2.md @@ -51,7 +51,7 @@ asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { }; ``` -As we can see, the `sys_call_table` is an array of `__NR_syscall_max + 1` size where the `__NR_syscall_max` macro represents the maximum number of system calls for the given [architecture](https://en.wikipedia.org/wiki/List_of_CPU_architectures). This book is about the [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture, so for our case the `__NR_syscall_max` is `547` and this is the correct number at the time of writing (current Linux kernel version is `5.0.0-rc7`). We can see this macro in the header file generated by [Kbuild](https://www.kernel.org/doc/Documentation/kbuild/makefiles.txt) during kernel compilation - include/generated/asm-offsets.h`: +As we can see, the `sys_call_table` is an array of `__NR_syscall_max + 1` size where the `__NR_syscall_max` macro represents the maximum number of system calls for the given [architecture](https://en.wikipedia.org/wiki/List_of_CPU_architectures). This book is about the [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture, so for our case the `__NR_syscall_max` is `547` and this is the correct number at the time of writing (current Linux kernel version is `5.0.0-rc7`). We can see this macro in the header file generated by [Kbuild](https://www.kernel.org/doc/Documentation/kbuild/makefiles.txt) during kernel compilation - `include/generated/asm-offsets.h`: ```C #define __NR_syscall_max 547 @@ -126,7 +126,7 @@ SYSCALL invokes an OS system-call handler at privilege level 0. It does so by loading RIP from the IA32_LSTAR MSR ``` -it means that we need to put the system call entry in to the `IA32_LSTAR` [model specific register](https://en.wikipedia.org/wiki/Model-specific_register). This operation takes place during the Linux kernel initialization process. If you have read the fourth [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-4) of the chapter that describes interrupts and interrupt handling in the Linux kernel, you know that the Linux kernel calls the `trap_init` function during the initialization process. This function is defined in the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/setup.c) source code file and executes the initialization of the `non-early` exception handlers like divide error, [coprocessor](https://en.wikipedia.org/wiki/Coprocessor) error etc. Besides the initialization of the `non-early` exceptions handlers, this function calls the `cpu_init` function from the [arch/x86/kernel/cpu/common.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/cpu/common.c) source code file which besides initialization of `per-cpu` state, calls the `syscall_init` function from the same source code file. +It means that we need to put the system call entry in to the `IA32_LSTAR` [model specific register](https://en.wikipedia.org/wiki/Model-specific_register). This operation takes place during the Linux kernel initialization process. If you have read the fourth [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-4) of the chapter that describes interrupts and interrupt handling in the Linux kernel, you know that the Linux kernel calls the `trap_init` function during the initialization process. This function is defined in the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/setup.c) source code file and executes the initialization of the `non-early` exception handlers like divide error, [coprocessor](https://en.wikipedia.org/wiki/Coprocessor) error, etc. Besides the initialization of the `non-early` exceptions handlers, this function calls the `cpu_init` function from the [arch/x86/kernel/cpu/common.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/cpu/common.c) source code file which besides initialization of `per-cpu` state, calls the `syscall_init` function from the same source code file. This function performs the initialization of the system call entry point. Let's look on the implementation of this function. It does not take parameters and first of all it fills two model specific registers: @@ -135,7 +135,7 @@ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); wrmsrl(MSR_LSTAR, entry_SYSCALL_64); ``` -The first model specific register - `MSR_STAR` contains `63:48` bits of the user code segment. These bits will be loaded to the `CS` and `SS` segment registers for the `sysret` instruction which provides functionality to return from a system call to user code with the related privilege. Also the `MSR_STAR` contains `47:32` bits from the kernel code that will be used as the base selector for `CS` and `SS` segment registers when user space applications execute a system call. In the second line of code we fill the `MSR_LSTAR` register with the `entry_SYSCALL_64` symbol that represents system call entry. The `entry_SYSCALL_64` is defined in the [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/entry_64.S) assembly file and contains code related to the preparation performed before a system call handler will be executed (I already wrote about these preparations, read above). We will not consider the `entry_SYSCALL_64` now, but will return to it later in this chapter. +The first model specific register - `MSR_STAR` contains `63:48` bits of the user code segment. These bits will be loaded to the `CS` and `SS` segment registers for the `sysret` instruction which provides functionality to return from a system call to user code with the related privilege. Also the `MSR_STAR` contains `47:32` bits from the kernel code that will be used as the base selector for `CS` and `SS` segment registers when user space applications execute a system call. In the second line of code we fill the `MSR_LSTAR` register with the `entry_SYSCALL_64` symbol that represents system call entry. The `entry_SYSCALL_64` is defined in the [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/entry_64.S) assembly file and contains code related to the preparation performed before a system call handler is executed (I already wrote about these preparations, read above). We will not consider the `entry_SYSCALL_64` now, but will return to it later in this chapter. After we have set the entry point for system calls, we need to set the following model specific registers: @@ -193,10 +193,10 @@ wrmsrl(MSR_SYSCALL_MASK, These flags will be cleared during syscall initialization. That's all, it is the end of the `syscall_init` function and it means that system call entry is ready to work. Now we can see what will occur when a user application executes the `syscall` instruction. -Preparation before system call handler will be called +Preparation before system call handler is called -------------------------------------------------------------------------------- -As I already wrote, before a system call or an interrupt handler will be called by the Linux kernel we need to do some preparations. The `idtentry` macro performs the preparations required before an exception handler will be executed, the `interrupt` macro performs the preparations required before an interrupt handler will be called and the `entry_SYSCALL_64` will do the preparations required before a system call handler will be executed. +As I already wrote, before a system call or an interrupt handler is called by the Linux kernel we need to do some preparations. The `idtentry` macro performs the preparations required before an exception handler is executed, the `interrupt` macro performs the preparations required before an interrupt handler is called and the `entry_SYSCALL_64` will do the preparations required before a system call handler is executed. The `entry_SYSCALL_64` is defined in the [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/entry_64.S) assembly file and starts from the following macro: diff --git a/SysCall/linux-syscall-3.md b/SysCall/linux-syscall-3.md index 193ba185..742ea5d4 100644 --- a/SysCall/linux-syscall-3.md +++ b/SysCall/linux-syscall-3.md @@ -36,7 +36,7 @@ static inline void map_vsyscall(void) {} #endif ``` -As we can read in the help text, the `CONFIG_X86_VSYSCALL_EMULATION` configuration option: `Enable vsyscall emulation`. Why emulate `vsyscall`? Actually, the `vsyscall` is a legacy [ABI](https://en.wikipedia.org/wiki/Application_binary_interface) due to security reasons. Virtual system calls have fixed addresses, meaning that `vsyscall` page is still at the same location every time and the location of this page is determined in the `map_vsyscall` function. Let's look on the implementation of this function: +As we can read in the help text, the `CONFIG_X86_VSYSCALL_EMULATION` configuration option: `Enable vsyscall emulation`. Why emulate `vsyscall`? Actually, the `vsyscall` is a legacy [ABI](https://en.wikipedia.org/wiki/Application_binary_interface) due to security reasons. Virtual system calls have fixed addresses, meaning that `vsyscall` page is still at the same location every time and the location of this page is determined in the `map_vsyscall` function. Let's look on the implementation of this function: ```C void __init map_vsyscall(void) @@ -223,7 +223,7 @@ do_ret: regs->ip = caller; regs->sp += 8; return true; -``` +``` That's all. Now let's look on the modern concept - `vDSO`. @@ -263,10 +263,10 @@ static int __init init_vdso(void) #ifdef CONFIG_X86_X32_ABI init_vdso_image(&vdso_image_x32); -#endif +#endif ``` -Both functions initialize the `vdso_image` structure. This structure is defined in the two generated source code files: the [arch/x86/entry/vdso/vdso-image-64.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/vdso/vdso-image-64.c) and the [arch/x86/entry/vdso/vdso-image-32.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/vdso/vdso-image-32.c). These source code files generated by the [vdso2c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/vdso/vdso2c.c) program from the different source code files, represent different approaches to call a system call like `int 0x80`, `sysenter` and etc. The full set of the images depends on the kernel configuration. +Both functions initialize the `vdso_image` structure. This structure is defined in the two generated source code files: the [arch/x86/entry/vdso/vdso-image-64.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/vdso/vdso-image-64.c) and the [arch/x86/entry/vdso/vdso-image-32.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/vdso/vdso-image-32.c). These source code files generated by the [vdso2c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/vdso/vdso2c.c) program from the different source code files, represent different approaches to call a system call like `int 0x80`, `sysenter`, etc. The full set of the images depends on the kernel configuration. For example for the `x86_64` Linux kernel it will contain `vdso_image_64`: @@ -296,7 +296,7 @@ If our kernel is configured for the `x86` architecture or for the `x86_64` and c #endif ``` -As we can understand from the name of the `vdso_image` structure, it represents image of the `vDSO` for the certain mode of the system call entry. This structure contains information about size in bytes of the `vDSO` area that always a multiple of `PAGE_SIZE` (`4096` bytes), pointer to the text mapping, start and end address of the `alternatives` (set of instructions with better alternatives for the certain type of the processor) and etc. For example `vdso_image_64` looks like this: +As we can understand from the name of the `vdso_image` structure, it represents image of the `vDSO` for the certain mode of the system call entry. This structure contains information about size in bytes of the `vDSO` area that's always a multiple of `PAGE_SIZE` (`4096` bytes), pointer to the text mapping, start and end address of the `alternatives` (set of instructions with better alternatives for the certain type of the processor), etc. For example `vdso_image_64` looks like this: ```C const struct vdso_image vdso_image_64 = { diff --git a/SysCall/linux-syscall-4.md b/SysCall/linux-syscall-4.md index 6d993c94..91076e11 100644 --- a/SysCall/linux-syscall-4.md +++ b/SysCall/linux-syscall-4.md @@ -98,9 +98,9 @@ struct user_arg_ptr envp = { .ptr.native = __envp }; return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); ``` -The `do_execveat_common` function does main work - it executes a new program. This function takes similar set of arguments, but as you can see it takes five arguments instead of three. The first argument is the file descriptor that represent directory with our application, in our case the `AT_FDCWD` means that the given pathname is interpreted relative to the current working directory of the calling process. The fifth argument is flags. In our case we passed `0` to the `do_execveat_common`. We will check in a next step, so will see it latter. +The `do_execveat_common` function does main work - it executes a new program. This function takes similar set of arguments, but as you can see it takes five arguments instead of three. The first argument is the file descriptor that represent directory with our application, in our case the `AT_FDCWD` means that the given pathname is interpreted relative to the current working directory of the calling process. The fifth argument is flags. In our case we passed `0` to the `do_execveat_common`. We will check in a next step, so will see it later. -First of all the `do_execveat_common` function checks the `filename` pointer and returns if it is `NULL`. After this we check flags of the current process that limit of running processes is not exceed: +First of all the `do_execveat_common` function checks the `filename` pointer and returns if it is `NULL`. After this we check flags of the current process that limit of running processes is not exceeded: ```C if (IS_ERR(filename)) @@ -201,7 +201,7 @@ if (retval) The `bprm_mm_init` defined in the same source code file and as we can understand from the function's name, it makes initialization of the memory descriptor or in other words the `bprm_mm_init` function initializes `mm_struct` structure. This structure defined in the [include/linux/mm_types.h](https://github.com/torvalds/linux/blob/master/include/linux/mm_types.h) header file and represents address space of a process. We will not consider implementation of the `bprm_mm_init` function because we do not know many important stuff related to the Linux kernel memory manager, but we just need to know that this function initializes `mm_struct` and populate it with a temporary stack `vm_area_struct`. -After this we calculate the count of the command line arguments which are were passed to the our executable binary, the count of the environment variables and set it to the `bprm->argc` and `bprm->envc` respectively: +After this we calculate the count of the command line arguments which were passed to our executable binary, the count of the environment variables and set it to the `bprm->argc` and `bprm->envc` respectively: ```C bprm->argc = count(argv, MAX_ARG_STRINGS); @@ -274,7 +274,7 @@ and call the: search_binary_handler(bprm); ``` -function. This function goes through the list of handlers that contains different binary formats. Currently the Linux kernel supports following binary formats: +function. This function goes through the list of handlers that contains different binary formats. Currently the Linux kernel supports the following binary formats: * `binfmt_script` - support for interpreted scripts that are starts from the [#!](https://en.wikipedia.org/wiki/Shebang_%28Unix%29) line; * `binfmt_misc` - support different binary formats, according to runtime configuration of the Linux kernel; @@ -385,9 +385,9 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, } ``` -The `start_thread_common` function fills `fs` segment register with zero and `es` and `ds` with the value of the data segment register. After this we set new values to the [instruction pointer](https://en.wikipedia.org/wiki/Program_counter), `cs` segments etc. In the end of the `start_thread_common` function we can see the `force_iret` macro that force a system call return via `iret` instruction. Ok, we prepared new thread to run in userspace and now we can return from the `exec_binprm` and now we are in the `do_execveat_common` again. After the `exec_binprm` will finish its execution we release memory for structures that was allocated before and return. +The `start_thread_common` function fills `fs` segment register with zero and `es` and `ds` with the value of the data segment register. After this we set new values to the [instruction pointer](https://en.wikipedia.org/wiki/Program_counter), `cs` segments etc. In the end of the `start_thread_common` function we can see the `force_iret` macro that forces a system call return via `iret` instruction. Ok, we prepared new thread to run in userspace and now we can return from the `exec_binprm` and now we are in the `do_execveat_common` again. After the `exec_binprm` will finish its execution we release memory for structures that was allocated before and return. -After we returned from the `execve` system call handler, execution of our program will be started. We can do it, because all context related information already configured for this purpose. As we saw the `execve` system call does not return control to a process, but code, data and other segments of the caller process are just overwritten of the program segments. The exit from our application will be implemented through the `exit` system call. +After we returned from the `execve` system call handler, execution of our program will be started. We can do it, because all context related information is already configured for this purpose. As we saw the `execve` system call does not return control to a process, but code, data and other segments of the caller process are just overwritten of the program segments. The exit from our application will be implemented through the `exit` system call. That's all. From this point our program will be executed. diff --git a/SysCall/linux-syscall-5.md b/SysCall/linux-syscall-5.md index 17278a69..4ec7470a 100644 --- a/SysCall/linux-syscall-5.md +++ b/SysCall/linux-syscall-5.md @@ -4,11 +4,11 @@ How does the `open` system call work Introduction -------------------------------------------------------------------------------- -This is the fifth part of the chapter that describes [system calls](https://en.wikipedia.org/wiki/System_call) mechanism in the Linux kernel. Previous parts of this chapter described this mechanism in general. Now I will try to describe implementation of different system calls in the Linux kernel. Previous parts from this chapter and parts from other chapters of the books describe mostly deep parts of the Linux kernel that are faintly visible or fully invisible from the userspace. But the Linux kernel code is not only about itself. The vast of the Linux kernel code provides ability to our code. Due to the linux kernel our programs can read/write from/to files and don't know anything about sectors, tracks and other parts of a disk structures, we can send data over network and don't build encapsulated network packets by hand and etc. +This is the fifth part of the chapter that describes [system calls](https://en.wikipedia.org/wiki/System_call) mechanism in the Linux kernel. Previous parts of this chapter described the mechanism of system calls in general. I will now try to describe the implementation of different system calls in the Linux kernel. Previous parts from this chapter and parts of other chapters of the book mostly described deep parts of the Linux kernel that are barely visible or invisible from userspace. However, the greatness of the Linux kernel is not its singular existence, but its ability to enable our code to perform various useful functions such as reading/writing from/to files without the knowledge of details such as sectors, tracks and other nitty gritties of the disk layout. For eg., the kernel allows programs to send data over networks without our having to encapsulate network packets by hand etc. -I don't know how about you, but it is interesting to me not only how an operating system works, but how do my software interacts with it. As you may know, our programs interacts with the kernel through the special mechanism which is called [system call](https://en.wikipedia.org/wiki/System_call). So, I've decided to write series of parts which will describe implementation and behavior of system calls which we are using every day like `read`, `write`, `open`, `close`, `dup` and etc. +I don't know how about you, but the inner workings of the operating system both fascinate and excite my curiosity greatly. As you may know, our programs interact with the kernel through a special mechanism called [system call](https://en.wikipedia.org/wiki/System_call). I will hence attempt to describe the implementation and behavior of system calls such as `read`, `write`, `open`, `close`, `dup` etc. in a series of articles. -I have decided to start from the description of the [open](http://man7.org/linux/man-pages/man2/open.2.html) system call. if you have written at least one `C` program, you should know that before we are able to read/write or execute other manipulations with a file we need to open it with the `open` function: +Let me start with the description of the simplest (and commonly used) [open](http://man7.org/linux/man-pages/man2/open.2.html) system call. if you have done any `C` programming at all, you should know that a file must be opened using the `open` system call before we are able to read/write to it. ```C #include @@ -28,12 +28,12 @@ int main(int argc, char *argv) { printf("file successfully opened\n"); } - close(fd); + close(fd); return 0; } ``` -In this case, the open is the function from standard library, but not system call. The standard library will call related system call for us. The `open` call will return a [file descriptor](https://en.wikipedia.org/wiki/File_descriptor) which is just a unique number within our process which is associated with the opened file. Now as we opened a file and got file descriptor as result of `open` call, we may start to interact with this file. We can write into, read from it and etc. List of opened file by a process is available via [proc](https://en.wikipedia.org/wiki/Procfs) filesystem: +In this case, `open` is a function from standard library, but not the system call. The standard library will call the related system call for us. The `open` call will return a [file descriptor](https://en.wikipedia.org/wiki/File_descriptor) which is just a unique number within our process which is associated with the opened file. Now as we opened a file and got file descriptor as result of `open` call, we may start to interact with this file. We can write into, read from it and etc. List of opened file by a process is available via [proc](https://en.wikipedia.org/wiki/Procfs) filesystem: ``` $ sudo ls /proc/1/fd/ @@ -42,14 +42,14 @@ $ sudo ls /proc/1/fd/ 1 11 13 15 19 20 22 24 26 28 3 31 33 35 37 39 40 42 44 46 48 5 51 54 57 59 60 62 65 7 9 ``` -I am not going to describe more details about the `open` routine from the userspace view in this post, but mostly from the kernel side. if you are not very familiar with, you can get more info in the [man page](http://man7.org/linux/man-pages/man2/open.2.html). +I am not going to describe more details about the `open` routine from the userspace view in this post, but mostly from the kernel side. If you are not very familiar with, you can get more info in the [man page](http://man7.org/linux/man-pages/man2/open.2.html). So let's start. Definition of the open system call -------------------------------------------------------------------------------- -If you have read the [fourth part](https://github.com/0xAX/linux-insides/blob/master/SysCall/linux-syscall-4.md) of the [linux-insides](https://github.com/0xAX/linux-insides/blob/master/SUMMARY.md) book, you should know that system calls are defined with the help of `SYSCALL_DEFINE` macro. So, the `open` system call is not exception. +If you have read the [fourth part](https://github.com/0xAX/linux-insides/blob/master/SysCall/linux-syscall-4.md) of the [linux-insides](https://github.com/0xAX/linux-insides/blob/master/SUMMARY.md) book, you should know that system calls are defined with the help of `SYSCALL_DEFINE` macro. So, the `open` system call is no exception. Definition of the `open` system call is located in the [fs/open.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/open.c) source code file and looks pretty small for the first view: @@ -63,7 +63,7 @@ SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) } ``` -As you may guess, the `do_sys_open` function from the [same](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/open.c) source code file does the main job. But before this function will be called, let's consider the `if` clause from which the implementation of the `open` system call starts: +As you may guess, the `do_sys_open` function from the [same](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/open.c) source code file does the main job. But before this function is called, let's consider the `if` clause from which the implementation of the `open` system call starts: ```C if (force_o_largefile()) @@ -81,22 +81,22 @@ As we may read in the [GNU C Library Reference Manual](https://www.gnu.org/softw > off_t > -> This is a signed integer type used to represent file sizes. +> This is a signed integer type used to represent file sizes. > In the GNU C Library, this type is no narrower than int. -> If the source is compiled with _FILE_OFFSET_BITS == 64 this +> If the source is compiled with _FILE_OFFSET_BITS == 64 this > type is transparently replaced by off64_t. and > off64_t > -> This type is used similar to off_t. The difference is that +> This type is used similar to off_t. The difference is that > even on 32 bit machines, where the off_t type would have 32 bits, > off64_t has 64 bits and so is able to address files up to 2^63 bytes -> in length. When compiling with _FILE_OFFSET_BITS == 64 this type +> in length. When compiling with _FILE_OFFSET_BITS == 64 this type > is available under the name off_t. -So it is not hard to guess that the `off_t`, `off64_t` and `O_LARGEFILE` are about a file size. In the case of the Linux kernel, the `O_LARGEFILE` is used to disallow opening large files on 32bit systems if the caller didn't specify `O_LARGEFILE` flag during opening of a file. On 64bit systems we force on this flag in open system call. And the `force_o_largefile` macro from the [include/linux/fcntl.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/fcntl.h#L7) linux kernel header file confirms this: +So it is not hard to guess that the `off_t`, `off64_t` and `O_LARGEFILE` are about a file size. In the case of the Linux kernel, the `O_LARGEFILE` is used to disallow opening large files on 32bit systems if the caller didn't specify `O_LARGEFILE` flag during opening of a file. On 64bit systems we force on this flag in open system call. And the `force_o_largefile` macro from the [include/linux/fcntl.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/fcntl.h#L7) Linux kernel header file confirms this: ```C #ifndef force_o_largefile @@ -108,7 +108,7 @@ This macro may be architecture-specific as for example for [IA-64](https://en.wi So, as we may see the `force_o_largefile` is just a macro which expands to the `true` value in our case of [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture. As we are considering 64-bit architecture, the `force_o_largefile` will be expanded to `true` and the `O_LARGEFILE` flag will be added to the set of flags which were passed to the `open` system call. -Now as we considered meaning of the `O_LARGEFILE` flag and `force_o_largefile` macro, we can proceed to the consideration of the implementation of the `do_sys_open` function. As I wrote above, this function is defined in the [same](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/open.c) source code file and looks: +Now as we considered meaning of the `O_LARGEFILE` flag and `force_o_largefile` macro, we can proceed to the consideration of the implementation of the `do_sys_open` function. As I wrote above, this function is defined in the [same](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/open.c) source code file and looks: ```C long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) @@ -204,7 +204,7 @@ else op->mode = 0; ``` -Here we reset permissions in `open_flags` instance if a opened file wasn't temporary and wasn't open for creation. This is because: +Here we reset permissions in `open_flags` instance if an open file wasn't temporary and wasn't open for creation. This is because: > if neither O_CREAT nor O_TMPFILE is specified, then mode is ignored. @@ -216,7 +216,7 @@ At the next step we check that a file is not tried to be opened via [fanotify](h flags &= ~FMODE_NONOTIFY & ~O_CLOEXEC; ``` -We do this to not leak a [file descriptor](https://en.wikipedia.org/wiki/File_descriptor). By default, the new file descriptor is set to remain open across an `execve` system call, but the `open` system call supports `O_CLOEXEC` flag that can be used to change this default behaviour. So we do this to prevent leaking of a file descriptor when one thread opens a file to set `O_CLOEXEC` flag and in the same time the second process does a [fork](https://en.wikipedia.org/wiki/Fork_\(system_call\)) + [execve](https://en.wikipedia.org/wiki/Exec_\(system_call\)) and as you may remember that child will have copies of the parent's set of open file descriptors. +We do this to not leak a [file descriptor](https://en.wikipedia.org/wiki/File_descriptor). By default, the new file descriptor is set to remain open across an `execve` system call, but the `open` system call supports `O_CLOEXEC` flag that can be used to change this default behaviour. So we do this to prevent leaking of a file descriptor when one thread opens a file to set `O_CLOEXEC` flag and in the same time the second process does a [fork](https://en.wikipedia.org/wiki/Fork_(system_call)) + [execve](https://en.wikipedia.org/wiki/Exec_(system_call)) and as you may remember that child will have copies of the parent's set of open file descriptors. At the next step we check that if our flags contains `O_SYNC` flag, we apply `O_DSYNC` flag too: @@ -256,7 +256,7 @@ So, in this case the file itself is not opened, but operations like `dup`, `fcnt op->open_flag = flags; ``` -Now we have filled `open_flag` field which represents flags that will control opening of a file and `mode` that will represent `umask` of a new file if we open file for creation. There are still to fill last flags in the our `open_flags` structure. The next is `op->acc_mode` which represents access mode to a opened file. We already filled the `acc_mode` local variable with the initial value at the beginning of the `build_open_flags` and now we check last two flags related to access mode: +Now we have filled `open_flag` field which represents flags that will control opening of a file and `mode` that will represent `umask` of a new file if we open file for creation. There are still to fill last flags in our `open_flags` structure. The next is `op->acc_mode` which represents access mode to a opened file. We already filled the `acc_mode` local variable with the initial value at the beginning of the `build_open_flags` and now we check last two flags related to access mode: ```C if (flags & O_TRUNC) @@ -319,7 +319,7 @@ getname(const char __user * filename) } ``` -So, it just calls the `getname_flags` function and returns its result. The main goal of the `getname_flags` function is to copy a file path given from userland to kernel space. The `filename` structure is defined in the [include/linux/fs.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/fs.h) linux kernel header file and contains following fields: +So, it just calls the `getname_flags` function and returns its result. The main goal of the `getname_flags` function is to copy a file path given from userland to kernel space. The `filename` structure is defined in the [include/linux/fs.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/fs.h) Linux kernel header file and contains following fields: * name - pointer to a file path in kernel space; * uptr - original pointer from userland; @@ -351,7 +351,7 @@ if (IS_ERR(f)) { The main goal of this function is to resolve given path name into `file` structure which represents an opened file of a process. If something going wrong and execution of the `do_filp_open` function will be failed, we should free new file descriptor with the `put_unused_fd` or in other way the `file` structure returned by the `do_filp_open` will be stored in the file descriptor table of the current process. -Now let's take a short look at the implementation of the `do_filp_open` function. This function is defined in the [fs/namei.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/namei.c) linux kernel source code file and starts from initialization of the `nameidata` structure. This structure will provide a link to a file [inode](https://en.wikipedia.org/wiki/Inode). Actually this is one of the main point of the `do_filp_open` function to acquire an `inode` by the filename given to `open` system call. After the `nameidata` structure will be initialized, the `path_openat` function will be called: +Now let's take a short look at the implementation of the `do_filp_open` function. This function is defined in the [fs/namei.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/namei.c) Linux kernel source code file and starts from initialization of the `nameidata` structure. This structure will provide a link to a file [inode](https://en.wikipedia.org/wiki/Inode). Actually this is one of the main point of the `do_filp_open` function to acquire an `inode` by the filename given to `open` system call. After the `nameidata` structure will be initialized, the `path_openat` function will be called: ```C filp = path_openat(&nd, op, flags | LOOKUP_RCU); @@ -364,22 +364,22 @@ if (unlikely(filp == ERR_PTR(-ESTALE))) Note that it is called three times. Actually, the Linux kernel will open the file in [RCU](https://www.kernel.org/doc/Documentation/RCU/whatisRCU.txt) mode. This is the most efficient way to open a file. If this try will be failed, the kernel enters the normal mode. The third call is relatively rare, only in the [nfs](https://en.wikipedia.org/wiki/Network_File_System) file system is likely to be used. The `path_openat` function executes `path lookup` or in other words it tries to find a `dentry` (what the Linux kernel uses to keep track of the hierarchy of files in directories) corresponding to a path. -The `path_openat` function starts from the call of the `get_empty_flip()` function that allocates a new `file` structure with some additional checks like do we exceed amount of opened files in the system or not and etc. After we have got allocated new `file` structure we call the `do_tmpfile` or `do_o_path` functions in a case if we have passed `O_TMPFILE | O_CREATE` or `O_PATH` flags during call of the `open` system call. These both cases are quite specific, so let's consider quite usual case when we want to open already existed file and want to read/write from/to it. +The `path_openat` function starts from the call of the `get_empty_flip()` function that allocates a new `file` structure with some additional checks like do we exceed amount of opened files in the system or not and etc. After we have got allocated new `file` structure we call the `do_tmpfile` or `do_o_path` functions in a case if we have passed `O_TMPFILE | O_CREATE` or `O_PATH` flags during call of the `open` system call. Both these cases are quite specific, so let's consider quite usual case when we want to open already existed file and want to read/write from/to it. -In this case the `path_init` function will be called. This function performs some preporatory work before actual path lookup. This includes search of start position of path traversal and its metadata like `inode` of the path, `dentry inode` and etc. This can be `root` directory - `/` or current directory as in our case, because we use `AT_CWD` as starting point (see call of the `do_sys_open` at the beginning of the post). +In this case the `path_init` function will be called. This function performs some preparatory work before actual path lookup. This includes search of start position of path traversal and its metadata like `inode` of the path, `dentry inode` and etc. This can be `root` directory - `/` or current directory as in our case, because we use `AT_CWD` as starting point (see call of the `do_sys_open` at the beginning of the post). The next step after the `path_init` is the [loop](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/namei.c#L3457) which executes the `link_path_walk` and `do_last`. The first function executes name resolution or in other words this function starts process of walking along a given path. It handles everything step by step except the last component of a file path. This handling includes checking of a permissions and getting a file component. As a file component is gotten, it is passed to `walk_component` that updates current directory entry from the `dcache` or asks underlying filesystem. This repeats before all path's components will not be handled in such way. After the `link_path_walk` will be executed, the `do_last` function will populate a `file` structure based on the result of the `link_path_walk`. As we reached last component of the given file path the `vfs_open` function from the `do_last` will be called. -This function is defined in the [fs/open.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/open.c) linux kernel source code file and the main goal of this function is to call an `open` operation of underlying filesystem. +This function is defined in the [fs/open.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/fs/open.c) Linux kernel source code file and the main goal of this function is to call an `open` operation of underlying filesystem. That's all for now. We didn't consider **full** implementation of the `open` system call. We skip some parts like handling case when we want to open a file from other filesystem with different mount point, resolving symlinks and etc., but it should be not so hard to follow this stuff. This stuff does not included in **generic** implementation of open system call and depends on underlying filesystem. If you are interested in, you may lookup the `file_operations.open` callback function for a certain [filesystem](https://github.com/torvalds/linux/tree/master/fs). Conclusion -------------------------------------------------------------------------------- -This is the end of the fifth part of the implementation of different system calls in the Linux kernel. If you have questions or suggestions, ping me on twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com), or just create an [issue](https://github.com/0xAX/linux-internals/issues/new). In the next part, we will continue to dive into system calls in the Linux kernel and see the implementation of the [read](http://man7.org/linux/man-pages/man2/read.2.html) system call. +This is the end of the fifth part of the implementation of different system calls in the Linux kernel. If you have questions or suggestions, ping me on twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com), or just create an [issue](https://github.com/0xAX/linux-insides/issues/new). In the next part, we will continue to dive into system calls in the Linux kernel and see the implementation of the [read](http://man7.org/linux/man-pages/man2/read.2.html) system call. -**Please note that English is not my first language and I am really sorry for any inconvenience. If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-internals).** +**Please note that English is not my first language and I am really sorry for any inconvenience. If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).** Links -------------------------------------------------------------------------------- @@ -389,7 +389,7 @@ Links * [file descriptor](https://en.wikipedia.org/wiki/File_descriptor) * [proc](https://en.wikipedia.org/wiki/Procfs) * [GNU C Library Reference Manual](https://www.gnu.org/software/libc/manual/html_mono/libc.html#File-Position-Primitive) -* [IA-64](https://en.wikipedia.org/wiki/IA-64) +* [IA-64](https://en.wikipedia.org/wiki/IA-64) * [x86_64](https://en.wikipedia.org/wiki/X86-64) * [opendir](http://man7.org/linux/man-pages/man3/opendir.3.html) * [fanotify](http://man7.org/linux/man-pages/man7/fanotify.7.html) diff --git a/SysCall/linux-syscall-6.md b/SysCall/linux-syscall-6.md index bc121968..3e5c6329 100644 --- a/SysCall/linux-syscall-6.md +++ b/SysCall/linux-syscall-6.md @@ -3,7 +3,7 @@ Limits on resources in Linux Each process in the system uses certain amount of different resources like files, CPU time, memory and so on. -Such resources are not infinite and each process and we should have an instrument to manage it. Sometimes it is useful to know current limits for a certain resource or to change it's value. In this post we will consider such instruments that allow us to get information about limits for a process and increase or decrease such limits. +Such resources are not infinite and each process and we should have an instrument to manage it. Sometimes it is useful to know current limits for a certain resource or to change its value. In this post we will consider such instruments that allow us to get information about limits for a process and increase or decrease such limits. We will start from userspace view and then we will look how it is implemented in the Linux kernel. @@ -175,7 +175,7 @@ if (new_rlim) { check that the given `soft` limit does not exceed `hard` limit and in a case when the given resource is the maximum number of a file descriptors that hard limit is not greater than `sysctl_nr_open` value. The value of the `sysctl_nr_open` can be found via [procfs](https://en.wikipedia.org/wiki/Procfs): ``` -~$ cat /proc/sys/fs/nr_open +~$ cat /proc/sys/fs/nr_open 1048576 ``` @@ -207,9 +207,9 @@ That's all. Conclusion -------------------------------------------------------------------------------- -This is the end of the second part that describes implementation of the system calls in the Linux kernel. If you have questions or suggestions, ping me on Twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com), or just create an [issue](https://github.com/0xAX/linux-internals/issues/new). +This is the end of the second part that describes implementation of the system calls in the Linux kernel. If you have questions or suggestions, ping me on Twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com), or just create an [issue](https://github.com/0xAX/linux-insides/issues/new). -**Please note that English is not my first language and I am really sorry for any inconvenience. If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-internals).** +**Please note that English is not my first language and I am really sorry for any inconvenience. If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).** Links -------------------------------------------------------------------------------- diff --git a/Theory/images/4_level_paging.png b/Theory/images/4_level_paging.png index 70d1ffec..7f6b5a1e 100644 Binary files a/Theory/images/4_level_paging.png and b/Theory/images/4_level_paging.png differ diff --git a/Theory/linux-theory-1.md b/Theory/linux-theory-1.md index 9b7e5aaa..9a09a371 100644 --- a/Theory/linux-theory-1.md +++ b/Theory/linux-theory-1.md @@ -8,7 +8,7 @@ In the fifth [part](https://0xax.gitbook.io/linux-insides/summary/booting/linux- Yeah, there will be many different things, but many many and once again many work with **memory**. -In my view, memory management is one of the most complex parts of the Linux kernel and in system programming in general. This is why we need to get acquainted with paging, before we proceed with the kernel initialization stuff. +In my view, memory management is one of the most complex parts of the Linux kernel and system programming in general. This is why we need to get acquainted with paging, before we proceed with the kernel initialization stuff. `Paging` is a mechanism that translates a linear memory address to a physical address. If you have read the previous parts of this book, you may remember that we saw segmentation in real mode when physical addresses are calculated by shifting a segment register by four and adding an offset. We also saw segmentation in protected mode, where we used the descriptor tables and base addresses from descriptors with offsets to calculate the physical addresses. Now we will see paging in 64-bit mode. @@ -27,7 +27,7 @@ There are three paging modes: * PAE paging; * IA-32e paging. -We will only explain the last mode here. To enable the `IA-32e paging` paging mode we need to do following things: +We will only explain the last mode here. To enable the `IA-32e paging` paging mode we need to do the following things: * set the `CR0.PG` bit; * set the `CR4.PAE` bit; @@ -171,7 +171,7 @@ This solution is `sign extension`. Here we can see that the lower 48 bits of a v * Kernel space * Userspace -Userspace occupies the lower part of the virtual address space, from `0x000000000000000` to `0x00007fffffffffff` and kernel space occupies the highest part from `0xffff8000000000` to `0xffffffffffffffff`. Note that bits `63:48` is 0 for userspace and 1 for kernel space. All addresses which are in kernel space and in userspace or in other words which higher `63:48` bits are zeroes or ones are called `canonical` addresses. There is a `non-canonical` area between these memory regions. Together these two memory regions (kernel space and user space) are exactly `2^48` bits wide. We can find the virtual memory map with 4 level page tables in the [Documentation/x86/x86_64/mm.txt](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/x86/x86_64/mm.txt): +Userspace occupies the lower part of the virtual address space, from `0x000000000000000` to `0x00007fffffffffff` and kernel space occupies the highest part from `0xffff8000000000` to `0xffffffffffffffff`. Note that bits `63:47` is 0 for userspace and 1 for kernel space. All addresses which are in kernel space and in userspace or in other words which higher `63:48` bits are zeroes or ones are called `canonical` addresses. There is a `non-canonical` area between these memory regions. Together these two memory regions (kernel space and user space) are exactly `2^48` bits wide. We can find the virtual memory map with 4 level page tables in the [Documentation/x86/x86_64/mm.txt](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/x86/x86_64/mm.txt): ``` 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm @@ -211,7 +211,7 @@ Usually kernel's `.text` starts here with the `CONFIG_PHYSICAL_START` offset. We ``` readelf -s vmlinux | grep ffffffff81000000 - 1: ffffffff81000000 0 SECTION LOCAL DEFAULT 1 + 1: ffffffff81000000 0 SECTION LOCAL DEFAULT 1 65099: ffffffff81000000 0 NOTYPE GLOBAL DEFAULT 1 _text 90766: ffffffff81000000 0 NOTYPE GLOBAL DEFAULT 1 startup_64 ``` diff --git a/Theory/linux-theory-2.md b/Theory/linux-theory-2.md index 091d1d29..404b5c12 100644 --- a/Theory/linux-theory-2.md +++ b/Theory/linux-theory-2.md @@ -1,7 +1,7 @@ Executable and Linkable Format ================================================================================ -ELF (Executable and Linkable Format) is a standard file format for executable files, object code, shared libraries and core dumps. Linux and many UNIX-like operating systems use this format. Let's look at the structure of the ELF-64 Object File Format and some definitions in the linux kernel source code which related with it. +ELF (Executable and Linkable Format) is a standard file format for executable files, object code, shared libraries and core dumps. Linux and many UNIX-like operating systems use this format. Let's look at the structure of the ELF-64 Object File Format and some definitions in the Linux kernel source code which related with it. An ELF object file consists of the following parts: @@ -13,7 +13,7 @@ Now let's have a closer look on these components. **ELF header** -The ELF header is located at the beginning of the object file. Its main purpose is to locate all other parts of the object file. The File header contains the following fields: +The ELF header is located at the beginning of the object file. Its main purpose is to locate all other parts of the object file. The file header contains the following fields: * ELF identification - array of bytes which helps identify the file as an ELF object file and also provides information about general object file characteristic; * Object file type - identifies the object file type. This field can describe that ELF file is a relocatable object file, an executable file, etc...; @@ -26,7 +26,7 @@ The ELF header is located at the beginning of the object file. Its main purpose * Size of a program header table entry; * and other fields... -You can find the `elf64_hdr` structure which presents ELF64 header in the linux kernel source code: +You can find the `elf64_hdr` structure which presents ELF64 header in the Linux kernel source code: ```C typedef struct elf64_hdr { @@ -64,7 +64,7 @@ All data stores in a sections in an Elf object file. Sections identified by inde * Address alignment boundary; * Size of entries, if section has table; -And presented with the following `elf64_shdr` structure in the linux kernel: +And presented with the following `elf64_shdr` structure in the Linux kernel: ```C typedef struct elf64_shdr { @@ -100,7 +100,7 @@ typedef struct elf64_phdr { } Elf64_Phdr; ``` -in the linux kernel source code. +in the Linux kernel source code. `elf64_phdr` defined in the same [elf.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/uapi/linux/elf.h#L254). @@ -109,12 +109,12 @@ The ELF object file also contains other fields/structures which you can find in vmlinux -------------------------------------------------------------------------------- -`vmlinux` is also a relocatable ELF object file . We can take a look at it with the `readelf` util. First of all let's look at the header: +`vmlinux` is also a relocatable ELF object file . We can take a look at it with the `readelf` utility. First of all let's look at the header: ``` $ readelf -h vmlinux ELF Header: - Magic: 7f 45 4c 46 02 01 01 00 00 00 00 00 00 00 00 00 + Magic: 7f 45 4c 46 02 01 01 00 00 00 00 00 00 00 00 00 Class: ELF64 Data: 2's complement, little endian Version: 1 (current) @@ -147,7 +147,7 @@ We can then look this address up in the `vmlinux` ELF object with: ``` $ readelf -s vmlinux | grep ffffffff81000000 - 1: ffffffff81000000 0 SECTION LOCAL DEFAULT 1 + 1: ffffffff81000000 0 SECTION LOCAL DEFAULT 1 65099: ffffffff81000000 0 NOTYPE GLOBAL DEFAULT 1 _text 90766: ffffffff81000000 0 NOTYPE GLOBAL DEFAULT 1 startup_64 ``` @@ -205,9 +205,9 @@ Program Headers: Segment Sections... 00 .text .notes __ex_table .rodata __bug_table .pci_fixup .builtin_fw .tracedata __ksymtab __ksymtab_gpl __kcrctab __kcrctab_gpl - __ksymtab_strings __param __modver - 01 .data .vvar - 02 .data..percpu + __ksymtab_strings __param __modver + 01 .data .vvar + 02 .data..percpu 03 .init.text .init.data .x86_cpu_dev.init .altinstructions .altinstr_replacement .iommu_table .apicdrivers .exit.text .smp_locks .data_nosave .bss .brk diff --git a/Theory/linux-theory-3.md b/Theory/linux-theory-3.md index a6f25f00..14856afc 100644 --- a/Theory/linux-theory-3.md +++ b/Theory/linux-theory-3.md @@ -57,7 +57,7 @@ __asm__ [volatile] [goto] (AssemblerTemplate [ : GotoLabels ]); ``` -All parameters which are marked with squared brackets are optional. You may notice that if we skip the optional parameters and the modifiers `volatile` and `goto` we obtain the `basic` form. +All parameters which are marked with squared brackets are optional. You may notice that if we skip the optional parameters and the modifiers `volatile` and `goto` we obtain the `basic` form. Let's start to consider this in order. The first optional `qualifier` is `volatile`. This specifier tells the compiler that an assembly statement may produce `side effects`. In this case we need to prevent compiler optimizations related to the given assembly statement. In simple terms the `volatile` specifier instructs the compiler not to modify the statement and place it exactly where it was in the original code. As an example let's look at the following function from the [Linux kernel](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/desc.h): @@ -157,7 +157,7 @@ These are input operands - variables `a` and `b`. We already know what the `r` q First of all our values `5` and `10` will be put at the stack and then these values will be moved to the two general purpose registers: `%rdx` and `%rax`. -This way the `%rax` register is used for storing the value of the `b` as well as storing the result of the calculation. **NOTE** that I've used `gcc 6.3.1` version, so the resulted code of your compiler may differ. +This way the `%rax` register is used for storing the value of the `b` as well as storing the result of the calculation. **NOTE** that I've used `gcc 6.3.1` version, so the resulted code of your compiler may differ. We have looked at input and output parameters of an inline assembly statement. Before we move on to other constraints supported by `gcc`, there is one remaining part of the inline assembly statement we have not discussed yet - `clobbers`. @@ -190,7 +190,7 @@ If we look at the assembly output: 400525: 48 01 d0 add %rdx,%rax ``` -we will see that the `%rdx` register is overwritten with `0x64` or `100` and the result will be `110` instead of `10`. Now if we add the `%rdx` register to the list of `clobbered` registers: +we will see that the `%rdx` register is overwritten with `0x64` or `100` and the result will be `110` instead of `15`. Now if we add the `%rdx` register to the list of `clobbered` registers: ```C __asm__("movq $100, %%rdx\t\n" @@ -231,7 +231,7 @@ int main(void) { unsigned long a[3] = {10000000000, 0, 1}; unsigned long b = 5; - + __asm__ volatile("incq %0" :: "m" (a[0])); printf("a[0] - b = %lu\n", a[0] - b); @@ -288,12 +288,12 @@ Now the result is correct. If we look at the assembly output again: ```assembly 00000000004004f6
: 400404: 48 b8 00 e4 0b 54 02 movabs $0x2540be400,%rax - 40040b: 00 00 00 + 40040b: 00 00 00 40040e: 48 89 04 24 mov %rax,(%rsp) 400412: 48 c7 44 24 08 00 00 movq $0x0,0x8(%rsp) - 400419: 00 00 + 400419: 00 00 40041b: 48 c7 44 24 10 01 00 movq $0x1,0x10(%rsp) - 400422: 00 00 + 400422: 00 00 400424: 48 ff 04 24 incq (%rsp) 400428: 48 8b 04 24 mov (%rsp),%rax 400431: 48 8d 70 fb lea -0x5(%rax),%rsi @@ -306,14 +306,14 @@ we will see one difference here which is in the last two lines: 400431: 48 8d 70 fb lea -0x5(%rax),%rsi ``` -Instead of constant folding, `GCC` now preserves calculations in the assembly and places the value of `a[0]` in the `%rax` register afterwards. In the end it just subtracts the constant value of `b` from the `%rax` register and puts result to the `%rsi`. +Instead of constant folding, `GCC` now preserves calculations in the assembly and places the value of `a[0]` in the `%rax` register afterwards. In the end it just subtracts the constant value of `b` from the `%rax` register and puts the result to the `%rsi`. Besides the `memory` specifier, we also see a new constraint here - `m`. This constraint tells the compiler to use the address of `a[0]`, instead of its value. So, now we are finished with `clobbers` and we may continue by looking at other constraints supported by `GCC` besides `r` and `m` which we have already seen. Constraints --------------------------------------------------------------------------------- -Now that we are finished with all three parts of an inline assembly statement, let's return to constraints. We already saw some constraints in the previous parts, like `r` which represents a `register` operand, `m` which represents a memory operand and `0-9` which represent an reused, indexed operand. Besides these `GCC` provides support for other constraints. For example the `i` constraint represents an `immediate` integer operand with know value: +Now that we are finished with all three parts of an inline assembly statement, let's return to constraints. We already saw some constraints in the previous parts, like `r` which represents a `register` operand, `m` which represents a memory operand and `0-9` which represent a reused, indexed operand. Besides these `GCC` provides support for other constraints. For example the `i` constraint represents an `immediate` integer operand with known value: ```C #include @@ -388,7 +388,7 @@ int main(void) { static unsigned long arr[3] = {0, 1, 2}; static unsigned long element; - + __asm__ volatile("movq 16+%1, %0" : "=r"(element) : "o"(arr)); printf("%lu\n", element); return 0; @@ -434,7 +434,7 @@ That's about all of the commonly used constraints in inline assembly statements. Architecture specific constraints -------------------------------------------------------------------------------- -Before we finish, let's look at the set of special constraints. These constrains are architecture specific and as this book is specific to the [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture, we will look at constraints related to it. First of all the set of `a` ... `d` and also `S` and `D` constraints represent [generic purpose](https://en.wikipedia.org/wiki/Processor_register) registers. In this case the `a` constraint corresponds to `%al`, `%ax`, `%eax` or `%rax` register depending on instruction size. The `S` and `D` constraints are `%si` and `%di` registers respectively. For example let's take our previous example. We can see in its assembly output that value of the `a` variable is stored in the `%eax` register. Now let's look at the assembly output of the same assembly, but with other constraint: +Before we finish, let's look at the set of special constraints. These constrains are architecture specific and as this book is specific to the [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture, we will look at constraints related to it. First of all the set of `a` ... `d` and also `S` and `D` constraints represent [generic purpose](https://en.wikipedia.org/wiki/Processor_register) registers. In this case the `a` constraint corresponds to `%al`, `%ax`, `%eax` or `%rax` register depending on instruction size. The `S` and `D` constraints are `%si` and `%di` registers respectively. For example let's take our previous example. We can see in its assembly output that value of the `a` variable is stored in the `%eax` register. Now let's look at the assembly output of the same assembly, but with other constraint: ```C #include @@ -464,7 +464,7 @@ Links -------------------------------------------------------------------------------- * [Linux kernel source code](https://github.com/torvalds/linux) -* [assembly programming language](https://en.wikipedia.org/wiki/Assembly_language) +* [assembly programming language](https://en.wikipedia.org/wiki/Assembly_language) * [GCC](https://en.wikipedia.org/wiki/GNU_Compiler_Collection) * [GNU extension](https://gcc.gnu.org/onlinedocs/gcc/C-Extensions.html) * [Global Descriptor Table](https://en.wikipedia.org/wiki/Global_Descriptor_Table) diff --git a/Timers/README.md b/Timers/README.md index cd975849..a1e895f6 100644 --- a/Timers/README.md +++ b/Timers/README.md @@ -1,6 +1,6 @@ # Timers and time management -This chapter describes timers and time management related concepts in the linux kernel. +This chapter describes timers and time management related concepts in the Linux kernel. * [Introduction](linux-timers-1.md) - An introduction to the timers in the Linux kernel. * [Introduction to the clocksource framework](linux-timers-2.md) - Describes `clocksource` framework in the Linux kernel. diff --git a/Timers/images/HZ.png b/Timers/images/HZ.png index 388ef231..93a46093 100644 Binary files a/Timers/images/HZ.png and b/Timers/images/HZ.png differ diff --git a/Timers/images/base_small.png b/Timers/images/base_small.png index 75084003..c5bfdfbf 100644 Binary files a/Timers/images/base_small.png and b/Timers/images/base_small.png differ diff --git a/Timers/linux-timers-1.md b/Timers/linux-timers-1.md index 7037aab3..e01a7135 100644 --- a/Timers/linux-timers-1.md +++ b/Timers/linux-timers-1.md @@ -371,7 +371,7 @@ u64 get_jiffies_64(void) EXPORT_SYMBOL(get_jiffies_64); ``` -Note that the `get_jiffies_64` function does not implemented as `jiffies_read` for example: +Note that the `get_jiffies_64` function is not implemented as `jiffies_read` for example: ```C static cycle_t jiffies_read(struct clocksource *cs) diff --git a/Timers/linux-timers-2.md b/Timers/linux-timers-2.md index 16786b87..647c4d90 100644 --- a/Timers/linux-timers-2.md +++ b/Timers/linux-timers-2.md @@ -79,15 +79,15 @@ As you can see, the `jiffies` variable is very widely used in the Linux kernel [ Introduction to `clocksource` -------------------------------------------------------------------------------- -The `clocksource` concept represents the generic API for clock sources management in the Linux kernel. Why do we need a separate framework for this? Let's go back to the beginning. The `time` concept is the fundamental concept in the Linux kernel and other operating system kernels. And the timekeeping is one of the necessities to use this concept. For example Linux kernel must know and update the time elapsed since system startup, it must determine how long the current process has been running for every processor and many many more. Where the Linux kernel can get information about time? First of all it is Real Time Clock or [RTC](https://en.wikipedia.org/wiki/Real-time_clock) that represents by the a nonvolatile device. You can find a set of architecture-independent real time clock drivers in the Linux kernel in the [drivers/rtc](https://github.com/torvalds/linux/tree/master/drivers/rtc) directory. Besides this, each architecture can provide a driver for the architecture-dependent real time clock, for example - `CMOS/RTC` - [arch/x86/kernel/rtc.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/rtc.c) for the [x86](https://en.wikipedia.org/wiki/X86) architecture. The second is system timer - timer that excites [interrupts](https://en.wikipedia.org/wiki/Interrupt) with a periodic rate. For example, for [IBM PC](https://en.wikipedia.org/wiki/IBM_Personal_Computer) compatibles it was - [programmable interval timer](https://en.wikipedia.org/wiki/Programmable_interval_timer). +The `clocksource` concept represents the generic API for clock sources management in the Linux kernel. Why do we need a separate framework for this? Let's go back to the beginning. The `time` concept is the fundamental concept in the Linux kernel and other operating system kernels. And the timekeeping is one of the necessities to use this concept. For example Linux kernel must know and update the time elapsed since system startup, it must determine how long the current process has been running for every processor and many many more. Where the Linux kernel can get information about time? First of all it is Real Time Clock or [RTC](https://en.wikipedia.org/wiki/Real-time_clock) that represents the nonvolatile device. You can find a set of architecture-independent real time clock drivers in the Linux kernel in the [drivers/rtc](https://github.com/torvalds/linux/tree/master/drivers/rtc) directory. Besides this, each architecture can provide a driver for the architecture-dependent real time clock, for example - `CMOS/RTC` - [arch/x86/kernel/rtc.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/rtc.c) for the [x86](https://en.wikipedia.org/wiki/X86) architecture. The second is system timer - timer that excites [interrupts](https://en.wikipedia.org/wiki/Interrupt) with a periodic rate. For example, for [IBM PC](https://en.wikipedia.org/wiki/IBM_Personal_Computer) compatibles it was - [programmable interval timer](https://en.wikipedia.org/wiki/Programmable_interval_timer). We already know that for timekeeping purposes we can use `jiffies` in the Linux kernel. The `jiffies` can be considered as read only global variable which is updated with `HZ` frequency. We know that the `HZ` is a compile-time kernel parameter whose reasonable range is from `100` to `1000` [Hz](https://en.wikipedia.org/wiki/Hertz). So, it is guaranteed to have an interface for time measurement with `1` - `10` milliseconds resolution. Besides standard `jiffies`, we saw the `refined_jiffies` clock source in the previous part that is based on the `i8253/i8254` [programmable interval timer](https://en.wikipedia.org/wiki/Programmable_interval_timer) tick rate which is almost `1193182` hertz. So we can get something about `1` microsecond resolution with the `refined_jiffies`. In this time, [nanoseconds](https://en.wikipedia.org/wiki/Nanosecond) are the favorite choice for the time value units of the given `clocksource`. -The availability of more precise techniques for time intervals measurement is hardware-dependent. We just knew a little about `x86` dependent timers hardware. But each architecture provides own timers hardware. Earlier each architecture had own implementation for this purpose. Solution of this problem is an abstraction layer and associated API in a common code framework for managing various clock sources and independent of the timer interrupt. This common code framework became - `clocksource` framework. +The availability of more precise techniques for time intervals measurement is hardware-dependent. We just knew a little about `x86` dependent timers hardware. But each architecture provides its own timer(s) hardware. Earlier each architecture had own implementation for this purpose. Solution of this problem is an abstraction layer and associated API in a common code framework for managing various clock sources and independent of the timer interrupt. This common code framework became - `clocksource` framework. Generic timeofday and `clocksource` management framework moved a lot of timekeeping code into the architecture independent portion of the code, with the architecture-dependent portion reduced to defining and managing low-level hardware pieces of clocksources. It takes a large amount of funds to measure the time interval on different architectures with different hardware, and it is very complex. Implementation of the each clock related service is strongly associated with an individual hardware device and as you can understand, it results in similar implementations for different architectures. -Within this framework, each clock source is required to maintain a representation of time as a monotonically increasing value. As we can see in the Linux kernel code, nanoseconds are the favorite choice for the time value units of a clock source in this time. One of the main point of the clock source framework is to allow a user to select clock source among a range of available hardware devices supporting clock functions when configuring the system and selecting, accessing and scaling different clock sources. +Within this framework, each clock source is required to maintain a representation of time as a monotonically increasing value. As we can see in the Linux kernel code, nanoseconds are the favorite choice for the time value units of a clock source at this time. One of the main point of the clock source framework is to allow a user to select clock source among a range of available hardware devices supporting clock functions when configuring the system and selecting, accessing and scaling different clock sources. The `clocksource` structure -------------------------------------------------------------------------------- @@ -123,7 +123,7 @@ struct clocksource { } ____cacheline_aligned; ``` -We already saw the first field of the `clocksource` structure in the previous part - it is pointer to the `read` function that returns best counter selected by the clocksource framework. For example we use `jiffies_read` function to read `jiffies` value: +We already saw the first field of the `clocksource` structure in the previous part - it is a pointer to the `read` function that returns best counter selected by the clocksource framework. For example we use `jiffies_read` function to read `jiffies` value: ```C static struct clocksource clocksource_jiffies = { @@ -171,7 +171,7 @@ is not `100%` accurate. Instead the number is taken as close as possible to a na * `suspend` - suspend function for the clocksource; * `resume` - resume function for the clocksource; -The next field is the `max_cycles` and as we can understand from its name, this field represents maximum cycle value before potential overflow. And the last field is `owner` represents reference to a kernel [module](https://en.wikipedia.org/wiki/Loadable_kernel_module) that is owner of a clocksource. This is all. We just went through all the standard fields of the `clocksource` structure. But you can noted that we missed some fields of the `clocksource` structure. We can divide all of missed field on two types: Fields of the first type are already known for us. For example, they are `name` field that represents name of a `clocksource`, the `rating` field that helps to the Linux kernel to select the best clocksource and etc. The second type, fields which are dependent from the different Linux kernel configuration options. Let's look on these fields. +The next field is the `max_cycles` and as we can understand from its name, this field represents maximum cycle value before potential overflow. And the last field is `owner` represents reference to a kernel [module](https://en.wikipedia.org/wiki/Loadable_kernel_module) that is owner of a clocksource. This is all. We just went through all the standard fields of the `clocksource` structure. But you might have noted that we missed some fields of the `clocksource` structure. We can divide all of missed field on two types: Fields of the first type are already known for us. For example, they are `name` field that represents name of a `clocksource`, the `rating` field that helps to the Linux kernel to select the best clocksource and etc. The second type, fields which are dependent from the different Linux kernel configuration options. Let's look on these fields. The first field is the `archdata`. This field has `arch_clocksource_data` type and depends on the `CONFIG_ARCH_CLOCKSOURCE_DATA` kernel configuration option. This field is actual only for the [x86](https://en.wikipedia.org/wiki/X86) and [IA64](https://en.wikipedia.org/wiki/IA-64) architectures for this moment. And again, as we can understand from the field's name, it represents architecture-specific data for a clock source. For example, it represents `vDSO` clock mode: @@ -180,7 +180,7 @@ struct arch_clocksource_data { int vclock_mode; }; ``` - + for the `x86` architectures. Where the `vDSO` clock mode can be one of the: ```C @@ -190,7 +190,7 @@ for the `x86` architectures. Where the `vDSO` clock mode can be one of the: #define VCLOCK_PVCLOCK 3 ``` -The last three fields are `wd_list`, `cs_last` and the `wd_last` depends on the `CONFIG_CLOCKSOURCE_WATCHDOG` kernel configuration option. First of all let's try to understand what is it `watchdog`. In a simple words, watchdog is a timer that is used for detection of the computer malfunctions and recovering from it. All of these three fields contain watchdog related data that is used by the `clocksource` framework. If we will grep the Linux kernel source code, we will see that only [arch/x86/KConfig](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/Kconfig#L54) kernel configuration file contains the `CONFIG_CLOCKSOURCE_WATCHDOG` kernel configuration option. So, why do `x86` and `x86_64` need in [watchdog](https://en.wikipedia.org/wiki/Watchdog_timer)? You already may know that all `x86` processors has special 64-bit register - [time stamp counter](https://en.wikipedia.org/wiki/Time_Stamp_Counter). This register contains number of [cycles](https://en.wikipedia.org/wiki/Clock_rate) since the reset. Sometimes the time stamp counter needs to be verified against another clock source. We will not see initialization of the `watchdog` timer in this part, before this we must learn more about timers. +The last three fields are `wd_list`, `cs_last` and the `wd_last` depends on the `CONFIG_CLOCKSOURCE_WATCHDOG` kernel configuration option. First of all let's try to understand what is `watchdog`. In a simple words, watchdog is a timer that is used for detection of the computer malfunctions and recovering from it. All of these three fields contain watchdog related data that is used by the `clocksource` framework. If we will grep the Linux kernel source code, we will see that only [arch/x86/KConfig](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/Kconfig#L54) kernel configuration file contains the `CONFIG_CLOCKSOURCE_WATCHDOG` kernel configuration option. So, why do `x86` and `x86_64` need in [watchdog](https://en.wikipedia.org/wiki/Watchdog_timer)? You already may know that all `x86` processors has special 64-bit register - [time stamp counter](https://en.wikipedia.org/wiki/Time_Stamp_Counter). This register contains number of [cycles](https://en.wikipedia.org/wiki/Clock_rate) since the reset. Sometimes the time stamp counter needs to be verified against another clock source. We will not see initialization of the `watchdog` timer in this part, before this we must learn more about timers. That's all. From this moment we know all fields of the `clocksource` structure. This knowledge will help us to learn insides of the `clocksource` framework. @@ -241,9 +241,9 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) } ``` -First of all we can see that the `__clocksource_register_scale` function starts from the call of the `__clocksource_update_freq_scale` function that defined in the same source code file and updates given clock source with the new frequency. Let's look on the implementation of this function. In the first step we need to check given frequency and if it was not passed as `zero`, we need to calculate `mult` and `shift` parameters for the given clock source. Why do we need to check value of the `frequency`? Actually it can be zero. if you attentively looked on the implementation of the `__clocksource_register` function, you may have noticed that we passed `frequency` as `0`. We will do it only for some clock sources that have self defined `mult` and `shift` parameters. Look in the previous [part](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-1) and you will see that we saw calculation of the `mult` and `shift` for `jiffies`. The `__clocksource_update_freq_scale` function will do it for us for other clock sources. +First of all we can see that the `__clocksource_register_scale` function starts from the call of the `__clocksource_update_freq_scale` function that defined in the same source code file and updates given clock source with the new frequency. Let's look on the implementation of this function. In the first step we need to check given frequency and if it was not passed as `zero`, we need to calculate `mult` and `shift` parameters for the given clock source. Why do we need to check value of the `frequency`? Actually it can be zero. If you attentively looked on the implementation of the `__clocksource_register` function, you may have noticed that we passed `frequency` as `0`. We will do it only for some clock sources that have self defined `mult` and `shift` parameters. Look in the previous [part](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-1) and you will see that we saw calculation of the `mult` and `shift` for `jiffies`. The `__clocksource_update_freq_scale` function will do it for us for other clock sources. -So in the start of the `__clocksource_update_freq_scale` function we check the value of the `frequency` parameter and if is not zero we need to calculate `mult` and `shift` for the given clock source. Let's look on the `mult` and `shift` calculation: +So in the start of the `__clocksource_update_freq_scale` function we check the value of the `frequency` parameter and if it is not zero we need to calculate `mult` and `shift` for the given clock source. Let's look on the `mult` and `shift` calculation: ```C void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq) @@ -259,7 +259,7 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq sec = 1; else if (sec > 600 && cs->mask > UINT_MAX) sec = 600; - + clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, NSEC_PER_SEC / scale, sec * scale); } @@ -406,21 +406,21 @@ and creation of three files: These files will provide information about current clock source in the system, available clock sources in the system and interface which allows to unbind the clock source. -After the `init_clocksource_sysfs` function will be executed, we will be able find some information about available clock sources in the: +After the `init_clocksource_sysfs` function is executed, we will be able to find some information about available clock sources in the: ``` -$ cat /sys/devices/system/clocksource/clocksource0/available_clocksource -tsc hpet acpi_pm +$ cat /sys/devices/system/clocksource/clocksource0/available_clocksource +tsc hpet acpi_pm ``` Or for example information about current clock source in the system: ``` -$ cat /sys/devices/system/clocksource/clocksource0/current_clocksource +$ cat /sys/devices/system/clocksource/clocksource0/current_clocksource tsc ``` -In the previous part, we saw API for the registration of the `jiffies` clock source, but didn't dive into details about the `clocksource` framework. In this part we did it and saw implementation of the new clock source registration and selection of a clock source with the best rating value in the system. Of course, this is not all API that `clocksource` framework provides. There a couple additional functions like `clocksource_unregister` for removing given clock source from the `clocksource_list` and etc. But I will not describe this functions in this part, because they are not important for us right now. Anyway if you are interesting in it, you can find it in the [kernel/time/clocksource.c](https://github.com/torvalds/linux/tree/master/kernel/time/clocksource.c). +In the previous part, we saw API for the registration of the `jiffies` clock source, but didn't dive into details about the `clocksource` framework. In this part we did it and saw implementation of the new clock source registration and selection of a clock source with the best rating value in the system. Of course, this is not all API that `clocksource` framework provides. There a couple additional functions like `clocksource_unregister` for removing given clock source from the `clocksource_list` and etc. But I will not describe this functions in this part, because they are not important for us right now. Anyway if you are interested in it, you can find it in the [kernel/time/clocksource.c](https://github.com/torvalds/linux/tree/master/kernel/time/clocksource.c). That's all. diff --git a/Timers/linux-timers-3.md b/Timers/linux-timers-3.md index 9658a8d6..44da1219 100644 --- a/Timers/linux-timers-3.md +++ b/Timers/linux-timers-3.md @@ -4,17 +4,17 @@ Timers and time management in the Linux kernel. Part 3. The tick broadcast framework and dyntick -------------------------------------------------------------------------------- -This is third part of the [chapter](https://0xax.gitbook.io/linux-insides/summary/timers/) which describes timers and time management related stuff in the Linux kernel and we stopped on the `clocksource` framework in the previous [part](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-2). We have started to consider this framework because it is closely related to the special counters which are provided by the Linux kernel. One of these counters which we already saw in the first [part](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-1) of this chapter is - `jiffies`. As I already wrote in the first part of this chapter, we will consider time management related stuff step by step during the Linux kernel initialization. Previous step was call of the: +This is third part of the [chapter](https://0xax.gitbook.io/linux-insides/summary/timers/) which describes timers and time management related stuff in the Linux kernel and we stopped on the `clocksource` framework in the previous [part](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-2). We have started to consider this framework because it is closely related to the special counters which are provided by the Linux kernel. One of these counters which we already saw in the first [part](https://0xax.gitbook.io/linux-insides/summary/timers/linux-timers-1.md) of this chapter is - `jiffies`. As I already wrote in the first part of this chapter, we will consider time management related stuff step by step during the Linux kernel initialization. Previous step was call of the: ```C register_refined_jiffies(CLOCK_TICK_RATE); ``` -function which defined in the [kernel/time/jiffies.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/jiffies.c) source code file and executes initialization of the `refined_jiffies` clock source for us. Recall that this function is called from the `setup_arch` function that defined in the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/setup.c) source code and executes architecture-specific ([x86_64](https://en.wikipedia.org/wiki/X86-64) in our case) initialization. Look on the implementation of the `setup_arch` and you will note that the call of the `register_refined_jiffies` is the last step before the `setup_arch` function will finish its work. +function which is defined in the [kernel/time/jiffies.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/jiffies.c) source code file and executes initialization of the `refined_jiffies` clock source for us. Recall that this function is called from the `setup_arch` function that is defined in the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/setup.c) source code and executes architecture-specific ([x86_64](https://en.wikipedia.org/wiki/X86-64) in our case) initialization. Look on the implementation of the `setup_arch` and you will note that the call of the `register_refined_jiffies` is the last step before the `setup_arch` function finishes its work. -There are many different `x86_64` specific things already configured after the end of the `setup_arch` execution. For example some early [interrupt](https://en.wikipedia.org/wiki/Interrupt) handlers already able to handle interrupts, memory space reserved for the [initrd](https://en.wikipedia.org/wiki/Initrd), [DMI](https://en.wikipedia.org/wiki/Desktop_Management_Interface) scanned, the Linux kernel log buffer is already set and this means that the [printk](https://en.wikipedia.org/wiki/Printk) function is able to work, [e820](https://en.wikipedia.org/wiki/E820) parsed and the Linux kernel already knows about available memory and and many many other architecture specific things (if you are interesting, you can read more about the `setup_arch` function and Linux kernel initialization process in the second [chapter](https://0xax.gitbook.io/linux-insides/summary/initialization) of this book). +There are many different `x86_64` specific things already configured after the end of the `setup_arch` execution. For example some early [interrupt](https://en.wikipedia.org/wiki/Interrupt) handlers already able to handle interrupts, memory space reserved for the [initrd](https://en.wikipedia.org/wiki/Initrd), [DMI](https://en.wikipedia.org/wiki/Desktop_Management_Interface) scanned, the Linux kernel log buffer is already set and this means that the [printk](https://en.wikipedia.org/wiki/Printk) function is able to work, [e820](https://en.wikipedia.org/wiki/E820) parsed and the Linux kernel already knows about available memory and and many many other architecture specific things (if you are interested, you can read more about the `setup_arch` function and Linux kernel initialization process in the second [chapter](https://0xax.gitbook.io/linux-insides/summary/initialization) of this book). -Now, the `setup_arch` finished its work and we can back to the generic Linux kernel code. Recall that the `setup_arch` function was called from the `start_kernel` function which is defined in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) source code file. So, we shall return to this function. You can see that there are many different function are called right after `setup_arch` function inside of the `start_kernel` function, but since our chapter is devoted to timers and time management related stuff, we will skip all code which is not related to this topic. The first function which is related to the time management in the Linux kernel is: +Now, the `setup_arch` finished its work and we can go back to the generic Linux kernel code. Recall that the `setup_arch` function was called from the `start_kernel` function which is defined in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) source code file. So, we shall return to this function. You can see that there are many different functions that are called right after `setup_arch` function inside of the `start_kernel` function, but since our chapter is devoted to timers and time management related stuff, we will skip all code which is not related to this topic. The first function which is related to the time management in the Linux kernel is: ```C tick_init(); @@ -25,12 +25,12 @@ in the `start_kernel`. The `tick_init` function defined in the [kernel/time/tick * Initialization of `tick broadcast` framework related data structures; * Initialization of `full` tickless mode related data structures. -We didn't see anything related to the `tick broadcast` framework in this book and didn't know anything about tickless mode in the Linux kernel. So, the main point of this part is to look on these concepts and to know what are they. +We didn't see anything related to the `tick broadcast` framework in this book and didn't know anything about tickless mode in the Linux kernel. So, the main point of this part is to look on these concepts and to know what they are. The idle process -------------------------------------------------------------------------------- -First of all, let's look on the implementation of the `tick_init` function. As I already wrote, this function defined in the [kernel/time/tick-common.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/tick-common.c) source code file and consists from the two calls of following functions: +First of all, let's look on the implementation of the `tick_init` function. As I already wrote, this function is defined in the [kernel/time/tick-common.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/tick-common.c) source code file and consists from the two calls of following functions: ```C void __init tick_init(void) @@ -51,16 +51,17 @@ static void cpu_idle_loop(void) { while (1) { while (!need_resched()) { + ... + ... + ... + /* the main idle function */ + cpuidle_idle_call(); + } ... ... ... - /* the main idle function */ - cpuidle_idle_call(); + schedule_preempt_disabled(); } - ... - ... - ... - schedule_preempt_disabled(); } ``` @@ -74,7 +75,7 @@ Whenever the idle task is selected to run, the periodic tick is disabled with th The second way is to omit scheduling-clock ticks on processors that are either in `idle` state or that have only one runnable task or in other words busy processor. We can enable this feature with the `CONFIG_NO_HZ_FULL` kernel configuration option and it allows to reduce the number of timer interrupts significantly. -Besides the `cpu_idle_loop`, idle processor can be in a sleeping state. The Linux kernel provides special `cpuidle` framework. Main point of this framework is to put an idle processor to sleeping states. The name of the set of these states is - `C-states`. But how does a processor will be woken if local timer is disabled? The linux kernel provides `tick broadcast` framework for this. The main point of this framework is assign a timer which is not affected by the `C-states`. This timer will wake a sleeping processor. +Besides the `cpu_idle_loop`, idle processor can be in a sleeping state. The Linux kernel provides special `cpuidle` framework. Main point of this framework is to put an idle processor to sleeping states. The name of the set of these states is - `C-states`. But how will a processor will be woken if local timer is disabled? The Linux kernel provides `tick broadcast` framework for this. The main point of this framework is assign a timer which is not affected by the `C-states`. This timer will wake a sleeping processor. Now, after some theory we can return to the implementation of our function. Let's recall that the `tick_init` function just calls two following functions: @@ -117,7 +118,7 @@ Ultimately, the memory space will be allocated for the given `cpumask` with the *mask = kmalloc_node(cpumask_size(), flags, node); ``` -Now let's look on the `cpumasks` that will be initialized in the `tick_broadcast_init` function. As we can see, the `tick_broadcast_init` function will initialize six `cpumasks`, and moreover, initialization of the last three `cpumasks` will be depended on the `CONFIG_TICK_ONESHOT` kernel configuration option. +Now let's look on the `cpumasks` that will be initialized in the `tick_broadcast_init` function. As we can see, the `tick_broadcast_init` function will initialize six `cpumasks`, and moreover, initialization of the last three `cpumasks` will depend on the `CONFIG_TICK_ONESHOT` kernel configuration option. The first three `cpumasks` are: @@ -130,7 +131,7 @@ As we already know, the next three `cpumasks` depends on the `CONFIG_TICK_ONESHO * `periodic` - clock events devices that support periodic events; * `oneshot` - clock events devices that capable of issuing events that happen only once. -The linux kernel defines two mask for such clock events devices in the [include/linux/clockchips.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/clockchips.h) header file: +The Linux kernel defines two mask for such clock events devices in the [include/linux/clockchips.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/clockchips.h) header file: ```C #define CLOCK_EVT_FEAT_PERIODIC 0x000001 @@ -157,7 +158,7 @@ struct tick_device { }; ``` -Note, that the `tick_device` structure contains two fields. The first field - `evtdev` represents pointer to the `clock_event_device` structure that defined in the [include/linux/clockchips.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/clockchips.h) header file and represents descriptor of a clock event device. A `clock event` device allows to register an event that will happen in the future. As I already wrote, we will not consider `clock_event_device` structure and related API in this part, but will see it in the next part. +Note, that the `tick_device` structure contains two fields. The first field - `evtdev` represents pointer to the `clock_event_device` structure that is defined in the [include/linux/clockchips.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/clockchips.h) header file and represents descriptor of a clock event device. A `clock event` device allows to register an event that will happen in the future. As I already wrote, we will not consider `clock_event_device` structure and related API in this part, but will see it in the next part. The second field of the `tick_device` structure represents mode of the `tick_device`. As we already know, the mode can be one of the: @@ -208,7 +209,7 @@ First of all we get the current `clock event` device from the `tick_broadcast_de static struct tick_device tick_broadcast_device; ``` -and represents external clock device that keeps track of events for a processor. The first step after we got the current clock device is the call of the `tick_check_broadcast_device` function which checks that a given clock events device can be utilized as broadcast device. The main point of the `tick_check_broadcast_device` function is to check value of the `features` field of the given `clock events` device. As we can understand from the name of this field, the `features` field contains a clock event device features. Available values defined in the [include/linux/clockchips.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/clockchips.h) header file and can be one of the `CLOCK_EVT_FEAT_PERIODIC` - which represents a clock events device which supports periodic events and etc. So, the `tick_check_broadcast_device` function check `features` flags for `CLOCK_EVT_FEAT_ONESHOT`, `CLOCK_EVT_FEAT_DUMMY` and other flags and returns `false` if the given clock events device has one of these features. In other way the `tick_check_broadcast_device` function compares `ratings` of the given clock event device and current clock event device and returns the best. +and represents external clock device that keeps track of events for a processor. The first step after we get the current clock device is the call of the `tick_check_broadcast_device` function which checks that a given clock events device can be utilized as broadcast device. The main point of the `tick_check_broadcast_device` function is to check value of the `features` field of the given `clock events` device. As we can understand from the name of this field, the `features` field contains a clock event device features. Available values defined in the [include/linux/clockchips.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/clockchips.h) header file and can be one of the `CLOCK_EVT_FEAT_PERIODIC` - which represents a clock events device which supports periodic events and etc. So, the `tick_check_broadcast_device` function check `features` flags for `CLOCK_EVT_FEAT_ONESHOT`, `CLOCK_EVT_FEAT_DUMMY` and other flags and returns `false` if the given clock events device has one of these features. In other way the `tick_check_broadcast_device` function compares `ratings` of the given clock event device and current clock event device and returns the best. After the `tick_check_broadcast_device` function, we can see the call of the `try_module_get` function that checks module owner of the clock events. We need to do it to be sure that the given `clock events` device was correctly initialized. The next step is the call of the `clockevents_exchange_device` function that defined in the [kernel/time/clockevents.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/time/clockevents.c) source code file and will release old clock events device and replace the previous functional handler with a dummy handler. @@ -292,7 +293,7 @@ static irqreturn_t hpet_interrupt_handler(int irq, void *data) } ``` -The `hpet_interrupt_handler` gets the [irq](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) specific data and check the event handler of the `clock event` device. Recall that we just set in the `tick_set_periodic_handler` function. So the `tick_handler_periodic_broadcast` function will be called in the end of the high precision event timer interrupt handler. +The `hpet_interrupt_handler` gets the [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) specific data and check the event handler of the `clock event` device. Recall that we just set in the `tick_set_periodic_handler` function. So the `tick_handler_periodic_broadcast` function will be called in the end of the high precision event timer interrupt handler. The `tick_handler_periodic_broadcast` function calls the @@ -314,7 +315,7 @@ if (bc_local) td->evtdev->event_handler(td->evtdev); ``` -which actually represents interrupt handler of the local timer of a processor. After this a processor will wake up. That is all about `tick broadcast` framework in the Linux kernel. We have missed some aspects of this framework, for example reprogramming of a `clock event` device and broadcast with the oneshot timer and etc. But the Linux kernel is very big, it is not real to cover all aspects of it. I think it will be interesting to dive into with yourself. +which actually represents interrupt handler of the local timer of a processor. After this a processor will wake up. That is all about `tick broadcast` framework in the Linux kernel. We have missed some aspects of this framework, for example reprogramming of a `clock event` device and broadcast with the oneshot timer and etc. But the Linux kernel is very big, it is not realistic to cover all aspects of it. I think it will be interesting to dive into it yourself. If you remember, we have started this part with the call of the `tick_init` function. We just consider the `tick_broadcast_init` function and related theory, but the `tick_init` function contains another call of a function and this function is - `tick_nohz_init`. Let's look on the implementation of this function. @@ -435,7 +436,7 @@ Links * [NO_HZ documentation](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/timers/NO_HZ.txt) * [cpumasks](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2) * [high precision event timer](https://en.wikipedia.org/wiki/High_Precision_Event_Timer) -* [irq](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) +* [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) * [IPI](https://en.wikipedia.org/wiki/Inter-processor_interrupt) * [CPUID](https://en.wikipedia.org/wiki/CPUID) * [APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller) diff --git a/book-A5.json b/book-A5.json new file mode 100644 index 00000000..2cda7624 --- /dev/null +++ b/book-A5.json @@ -0,0 +1,14 @@ +{ + "title": "Linux Insides", + "author" : "0xAX", + "pdf": { + "paperSize": "a5", + "margin": + { + "top": 48, + "bottom": 48, + "right": 28, + "left": 28 + } + } +} diff --git a/book.json b/book.json new file mode 100644 index 00000000..245a7ec9 --- /dev/null +++ b/book.json @@ -0,0 +1,4 @@ +{ + "title": "Linux Insides", + "author" : "0xAX" +} diff --git a/contributors.md b/contributors.md index 3ddfdd72..dd8d3ea2 100644 --- a/contributors.md +++ b/contributors.md @@ -1,5 +1,6 @@ -Thank you to all contributors: ------------------------------- +# Contributors + +Special thanks to all the people who helped to develop this project: * [Akash Shende](https://github.com/akash0x53) * [Jakub Kramarz](https://github.com/jkramarz) @@ -134,4 +135,9 @@ Thank you to all contributors: * [Yuxin Wu](https://github.com/chaffz) * [Biao Ding](https://github.com/SmallPond) * [Arfy slowy](https://github.com/slowy07) +* [Junbo Jiang](https://github.com/junbo42) +* [Dexter Plameras](https://github.com/dexterp) * [Jun Duan](https://github.com/waltforme) +* [Guochao Xie](https://github.com/XieGuochao) +* [Davide Benini](https://github.com/beninidavide/) +* [kyselejsyrecek](https://github.com/kyselejsyrecek) diff --git a/cover.jpg b/cover.jpg index e0ecfaea..02a3c813 100644 Binary files a/cover.jpg and b/cover.jpg differ diff --git a/scripts/check_code_snippets.py b/scripts/check_code_snippets.py new file mode 100644 index 00000000..e7bba82c --- /dev/null +++ b/scripts/check_code_snippets.py @@ -0,0 +1,84 @@ +""" +A script that takes the lines of the Linux kernel source code from the comments +in the markdown files that are attached to the code and checks their validity. +""" +import os +import re +import sys +from typing import Optional, Tuple + +import requests + +exclude_dirs = ["./.github"] + +def __split_url_and_range__(url: str) -> Tuple[str, Optional[int], Optional[int]]: + base, frag = url.split("#", 1) + m = re.match(r'L(\d+)(?:-L?(\d+))?$', frag) + start = int(m.group(1)) + end = int(m.group(2)) if m.group(2) else None + return base, start, end + +def __fetch_raw__(source: str) -> str: + r = requests.get(source, timeout=5.0) + return r.text + +def __handle_md__(md: str): + in_code = False + code = '' + content = '' + + md_lines = md.splitlines() + + for line in md_lines: + if in_code: + if re.search("^```[a-zA-Z].*", line): + continue + + if re.search("^```$", line): + in_code = False + continue + + code += line + '\n' + continue + + if line.startswith("