diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
new file mode 100644
index 00000000..d54d09d5
--- /dev/null
+++ b/.github/FUNDING.yml
@@ -0,0 +1,3 @@
+# These are supported funding model platforms
+
+patreon: 0xAX
diff --git a/.github/ISSUE_TEMPLATE/content-issue.yml b/.github/ISSUE_TEMPLATE/content-issue.yml
new file mode 100644
index 00000000..ae88eed8
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/content-issue.yml
@@ -0,0 +1,39 @@
+name: 📖 Content issue
+description: Report an issue with the content
+body:
+ - type: markdown
+ attributes:
+ value: |
+ Use this form to report an issue with the content.
+
+ When contributing, make sure to follow Contributing guidelines and Code of Conduct.
+ Thank you for your contribution!
+
+ - type: checkboxes
+ attributes:
+ label: Existing issues
+ description: Is there an existing issue for this? Search open and closed issues to avoid duplicates.
+ options:
+ - label: I have searched the existing issues.
+ required: true
+
+ - type: input
+ attributes:
+ label: Affected document
+ description: Name or paste a link to the document that contains an issue.
+ validations:
+ required: true
+
+ - type: textarea
+ attributes:
+ label: Issue description
+ description: Explain what is unclear or confusing in the given document.
+ validations:
+ required: true
+
+ - type: textarea
+ attributes:
+ label: Attachments
+ description: Include screenshots or links if applicable.
+ validations:
+ required: false
diff --git a/.github/ISSUE_TEMPLATE/question.yml b/.github/ISSUE_TEMPLATE/question.yml
new file mode 100644
index 00000000..a1255e1e
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/question.yml
@@ -0,0 +1,39 @@
+name: ❓ Questions and discussions
+description: Ask a question or start a discussion with other community members.
+body:
+ - type: markdown
+ attributes:
+ value: |
+ Use this form to ask a question or start a discussion with other community members.
+
+ When contributing, make sure to follow Contributing guidelines and Code of Conduct.
+ Thank you for your contribution!
+
+ - type: checkboxes
+ attributes:
+ label: Existing issues
+ description: Is there an existing issue for this? Search open and closed issues to avoid duplicates.
+ options:
+ - label: I have searched the existing issues.
+ required: true
+
+ - type: textarea
+ attributes:
+ label: Question
+ description: Ask a question you would like to discuss with the community.
+ validations:
+ required: false
+
+ - type: textarea
+ attributes:
+ label: Discussion
+ description: Start a discussion topic.
+ validations:
+ required: false
+
+ - type: textarea
+ attributes:
+ label: Attachments
+ description: Include screenshots, links, or example's output if applicable.
+ validations:
+ required: false
diff --git a/.github/dependabot.yaml b/.github/dependabot.yaml
new file mode 100644
index 00000000..12301490
--- /dev/null
+++ b/.github/dependabot.yaml
@@ -0,0 +1,6 @@
+version: 2
+updates:
+ - package-ecosystem: "github-actions"
+ directory: "/"
+ schedule:
+ interval: "daily"
diff --git a/.github/pull-request-template.md b/.github/pull-request-template.md
new file mode 100644
index 00000000..ae2a5cf7
--- /dev/null
+++ b/.github/pull-request-template.md
@@ -0,0 +1,18 @@
+
+
+**Description**
+
+
+
+Changes proposed in this pull request:
+
+- ...
+- ...
+- ...
+
+**Related issues**
+
+
diff --git a/.github/workflows/check-code-snippets.yaml b/.github/workflows/check-code-snippets.yaml
new file mode 100644
index 00000000..3aa534a5
--- /dev/null
+++ b/.github/workflows/check-code-snippets.yaml
@@ -0,0 +1,32 @@
+name: check code snippets
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - master  # same branch the e-book workflows in this repository target
+  pull_request:
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+ cancel-in-progress: true
+
+jobs:
+ check-code-snippets:
+ name: check-code-snippets
+ runs-on:
+ - ubuntu-22.04
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v5
+ - name: Setup python
+ uses: actions/setup-python@v6
+ with:
+ python-version: '3.13'
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install requests
+ - name: Validate code snippets
+ run: |
+ python ./scripts/check_code_snippets.py .
diff --git a/.github/workflows/generate-e-books.yaml b/.github/workflows/generate-e-books.yaml
new file mode 100644
index 00000000..84cb9c7b
--- /dev/null
+++ b/.github/workflows/generate-e-books.yaml
@@ -0,0 +1,63 @@
+name: Generate e-books
+
+on:
+ pull_request:
+ branches:
+ - master
+ workflow_dispatch: {} # For manual runs.
+
+jobs:
+ build-for-pr:
+ # For every PR, build the same artifacts and make them accessible from the PR.
+ if: github.event_name == 'pull_request'
+ runs-on: ubuntu-latest
+
+ permissions:
+ contents: read
+ pull-requests: write
+
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v5
+
+ - name: Export all supported book formats from the Docker container
+ run: |
+ make run
+ make export
+
+ - name: Copy generated files to host system
+ run: |
+ make cp
+ mkdir -p artifacts/
+ mv "Linux Inside - 0xAX.epub" \
+ "Linux Inside - 0xAX.mobi" \
+ "Linux Inside - 0xAX.pdf" \
+ "Linux Inside - 0xAX (A5).pdf" \
+ artifacts/
+
+ - name: Upload PR artifacts
+ uses: actions/upload-artifact@v4
+ with:
+ name: ebooks-${{ github.sha }}
+ path: artifacts/*
+ if-no-files-found: error
+ # Change the retention period here if necessary.
+ retention-days: 7
+
+ - name: Add a comment with a link to the generated artifacts.
+ # For forked PRs the token is read-only; skip commenting to avoid failures.
+ if: ${{ github.event.pull_request.head.repo.full_name == github.event.pull_request.base.repo.full_name }}
+ uses: actions/github-script@v8
+ env:
+ RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+ with:
+ script: |
+ const body = [
+ `E-books generated for this pull request available at: ${process.env.RUN_URL}`
+ ].join('\n');
+ await github.rest.issues.createComment({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ issue_number: context.issue.number,
+ body
+ });
diff --git a/.github/workflows/release-e-books.yaml b/.github/workflows/release-e-books.yaml
new file mode 100644
index 00000000..ef378954
--- /dev/null
+++ b/.github/workflows/release-e-books.yaml
@@ -0,0 +1,63 @@
+name: Release e-books
+
+on:
+ push:
+ tags:
+ - 'v*.*' # Create a release only when a new tag matching v*.* is pushed.
+ # To also create a release for each push to the main branch, uncomment the following 2 lines:
+ # branches:
+ # - master
+ workflow_dispatch: {} # For manual runs.
+
+jobs:
+ release-ebooks:
+ runs-on: ubuntu-latest
+
+ permissions:
+ contents: write
+
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v5
+
+ - name: Export all supported book formats from the Docker container
+ run: |
+ make run
+ make export
+
+ - name: Copy generated files to host system
+ run: |
+ make cp
+ mkdir -p artifacts/
+ mv "Linux Inside - 0xAX.epub" \
+ "Linux Inside - 0xAX.mobi" \
+ "Linux Inside - 0xAX.pdf" \
+ "Linux Inside - 0xAX (A5).pdf" \
+ artifacts/
+ cp LICENSE artifacts/
+
+ - name: Prepare release metadata
+ # Use tag name when running on a tag, otherwise fall back to the short commit hash.
+ id: meta
+ env:
+ GITHUB_REF_TYPE: ${{ github.ref_type }}
+ GITHUB_REF_NAME: ${{ github.ref_name }}
+ run: |
+ DATE_UTC="$(date -u '+%m/%d/%Y %H:%M')"
+ if [ "${GITHUB_REF_TYPE}" = "tag" ] && [ -n "${GITHUB_REF_NAME}" ]; then
+ LABEL="${GITHUB_REF_NAME}"
+ else
+ LABEL="$(git rev-parse --short HEAD)"
+ fi
+ echo "release_name=${DATE_UTC} (${LABEL})" >> "$GITHUB_OUTPUT"
+ echo "tag_name=${LABEL}" >> "$GITHUB_OUTPUT"
+
+ - name: Create GitHub release
+ uses: softprops/action-gh-release@v2
+ with:
+ files: artifacts/*
+ name: ${{ steps.meta.outputs.release_name }}
+ tag_name: ${{ steps.meta.outputs.tag_name }}
+ target_commitish: ${{ github.sha }}
+ generate_release_notes: true
+ fail_on_unmatched_files: true
diff --git a/Assets/linux-kernel.png b/Assets/linux-kernel.png
index 7cc0df73..5ab0a316 100644
Binary files a/Assets/linux-kernel.png and b/Assets/linux-kernel.png differ
diff --git a/Booting/README.md b/Booting/README.md
index 6ed74a78..fbcf31db 100644
--- a/Booting/README.md
+++ b/Booting/README.md
@@ -1,12 +1,12 @@
# Kernel Boot Process
-This chapter describes the linux kernel boot process. Here you will see a series of posts which describes the full cycle of the kernel loading process:
+This chapter describes the Linux kernel boot process. Here you will see a series of posts which describe the full cycle of the kernel loading process:
* [From the bootloader to kernel](linux-bootstrap-1.md) - describes all stages from turning on the computer to running the first instruction of the kernel.
* [First steps in the kernel setup code](linux-bootstrap-2.md) - describes first steps in the kernel setup code. You will see heap initialization, query of different parameters like EDD, IST and etc...
* [Video mode initialization and transition to protected mode](linux-bootstrap-3.md) - describes video mode initialization in the kernel setup code and transition to protected mode.
* [Transition to 64-bit mode](linux-bootstrap-4.md) - describes preparation for transition into 64-bit mode and details of transition.
* [Kernel Decompression](linux-bootstrap-5.md) - describes preparation before kernel decompression and details of direct decompression.
-* [Kernel random address randomization](linux-bootstrap-6.md) - describes randomization of the Linux kernel load address.
+* [Kernel load address randomization](linux-bootstrap-6.md) - describes randomization of the Linux kernel load address.
This chapter coincides with `Linux kernel v4.17`.
diff --git a/Booting/images/bss.png b/Booting/images/bss.png
index c9b802e9..b7a1e9b5 100644
Binary files a/Booting/images/bss.png and b/Booting/images/bss.png differ
diff --git a/Booting/images/early-bss.svg b/Booting/images/early-bss.svg
new file mode 100644
index 00000000..3d29cc7d
--- /dev/null
+++ b/Booting/images/early-bss.svg
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/Booting/images/early-heap.svg b/Booting/images/early-heap.svg
new file mode 100644
index 00000000..20c0e601
--- /dev/null
+++ b/Booting/images/early-heap.svg
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/Booting/images/early-page-table.svg b/Booting/images/early-page-table.svg
new file mode 100644
index 00000000..97bc6687
--- /dev/null
+++ b/Booting/images/early-page-table.svg
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/Booting/images/early-stack.svg b/Booting/images/early-stack.svg
new file mode 100644
index 00000000..2d493362
--- /dev/null
+++ b/Booting/images/early-stack.svg
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/Booting/images/kernel_first_address.png b/Booting/images/kernel_first_address.png
index 4fb0ddee..ba746f9e 100644
Binary files a/Booting/images/kernel_first_address.png and b/Booting/images/kernel_first_address.png differ
diff --git a/Booting/images/linear_address.png b/Booting/images/linear_address.png
deleted file mode 100644
index 6a9eaca0..00000000
Binary files a/Booting/images/linear_address.png and /dev/null differ
diff --git a/Booting/images/minimal_stack.png b/Booting/images/minimal_stack.png
index 123da34a..ac1ac9a1 100644
Binary files a/Booting/images/minimal_stack.png and b/Booting/images/minimal_stack.png differ
diff --git a/Booting/images/segment-descriptor.svg b/Booting/images/segment-descriptor.svg
new file mode 100644
index 00000000..fb3033af
--- /dev/null
+++ b/Booting/images/segment-descriptor.svg
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/Booting/images/segment-selector.svg b/Booting/images/segment-selector.svg
new file mode 100644
index 00000000..64695c60
--- /dev/null
+++ b/Booting/images/segment-selector.svg
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/Booting/images/simple_bootloader.png b/Booting/images/simple_bootloader.png
deleted file mode 100644
index 2443974d..00000000
Binary files a/Booting/images/simple_bootloader.png and /dev/null differ
diff --git a/Booting/images/stack1.png b/Booting/images/stack1.png
index 534be3af..78d4c999 100644
Binary files a/Booting/images/stack1.png and b/Booting/images/stack1.png differ
diff --git a/Booting/images/stack2.png b/Booting/images/stack2.png
index 9eb8f0cd..3e278950 100644
Binary files a/Booting/images/stack2.png and b/Booting/images/stack2.png differ
diff --git a/Booting/images/startup_32.svg b/Booting/images/startup_32.svg
new file mode 100644
index 00000000..69ac9502
--- /dev/null
+++ b/Booting/images/startup_32.svg
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/Booting/images/try_vmlinuz_in_qemu.png b/Booting/images/try_vmlinuz_in_qemu.png
index e24b925b..5b44d698 100644
Binary files a/Booting/images/try_vmlinuz_in_qemu.png and b/Booting/images/try_vmlinuz_in_qemu.png differ
diff --git a/Booting/images/video_mode_setup_menu.png b/Booting/images/video_mode_setup_menu.png
index 78530173..99162ace 100644
Binary files a/Booting/images/video_mode_setup_menu.png and b/Booting/images/video_mode_setup_menu.png differ
diff --git a/Booting/linux-bootstrap-1.md b/Booting/linux-bootstrap-1.md
index 7393b63f..b63b336f 100644
--- a/Booting/linux-bootstrap-1.md
+++ b/Booting/linux-bootstrap-1.md
@@ -1,124 +1,151 @@
-Kernel booting process. Part 1.
-================================================================================
+# Kernel Booting Process — Part 1
-From the bootloader to the kernel
---------------------------------------------------------------------------------
+If you’ve read my earlier [posts](https://github.com/0xAX/asm) about [assembly language](https://en.wikipedia.org/wiki/Assembly_language) for Linux x86_64, you might see that I started to get interested in low-level programming. I’ve written a set of articles on assembly programming for [x86_64](https://en.wikipedia.org/wiki/X86-64) Linux and, in parallel, began exploring the Linux kernel source code. I’ve always been fascinated by what happens under the hood — how programs execute on a CPU, how they’re laid out in memory, how the kernel schedules processes and manages resources, how the network stack operates at a low level, and many other details. This series is a way of sharing my journey.
-If you read my previous [blog posts](https://0xax.github.io/categories/assembler/), you might have noticed that I have been involved with low-level programming for some time. I wrote some posts about assembly programming for `x86_64` Linux and, at the same time, started to dive into the Linux kernel source code.
+> [!NOTE]
+> This is not official Linux kernel documentation; it is a learning project. I’m not a professional Linux kernel developer, and I don’t write kernel code as part of my daily job. Learning how the Linux kernel works is just my hobby. If you find anything unclear, spot an error, or have questions or suggestions, feel free to reach out - you can always ping me on X [0xAX](https://twitter.com/0xAX), send me an [email](mailto:anotherworldofworld@gmail.com) or open a new [issue](https://github.com/0xAX/linux-insides/issues/new). Your feedback is always welcome and appreciated.
-I have a great interest in understanding how low-level things work, how programs run on my computer, how they are located in memory, how the kernel manages processes and memory, how the network stack works at a low level, and many many other things. So, I decided to write yet another series of posts about the Linux kernel for the **x86_64** architecture.
+The main goal of this series is to provide a guide to the Linux kernel for readers who want to begin learning how it works. We will explore not only what the kernel does, but also try to understand how and why it does it. Although these notes are meant to be understandable for anyone who is interested in the Linux kernel, it is highly recommended to have some prior knowledge before starting to read them. If you want to experiment with the kernel code, first of all it is best to have a [Linux distribution](https://en.wikipedia.org/wiki/Linux_distribution) installed. Besides that, on these pages we will see a lot of [C](https://en.wikipedia.org/wiki/C_(programming_language)) and [assembly](https://en.wikipedia.org/wiki/Assembly_language) code, so a good understanding of these programming languages is required.
-Note that I'm not a professional kernel hacker and I don't write code for the kernel at work. It's just a hobby. I just like low-level stuff, and it is interesting for me to see how these things work. So if you notice anything confusing, or if you have any questions/remarks, ping me on Twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com) or just create an [issue](https://github.com/0xAX/linux-insides/issues/new). I appreciate it.
+> [!IMPORTANT]
+> I started writing this series when the latest version of the kernel was `3.18`. A lot has changed since then, and I am in the process of updating the content to reflect modern kernels where possible — now focusing on v6.16+. I’ll continue revising the posts as the kernel evolves.
-All posts will also be accessible at [github repo](https://github.com/0xAX/linux-insides) and, if you find something wrong with my English or the post content, feel free to send a pull request.
+That’s enough introduction — let’s dive into the Linux kernel!
-*Note that this isn't official documentation, just learning and sharing knowledge.*
+## The Magic Power Button - What happens next?
-**Required knowledge**
+Although this is a series of posts about the Linux kernel, we will not jump straight into kernel code. First, let’s step back and look at what happens before the kernel even comes into play. Everything starts with turning on the computer, and we will start from this point as well.
-* Understanding C code
-* Understanding assembly code (AT&T syntax)
+When you press the "magic" power button on your laptop or desktop computer, the [motherboard](https://en.wikipedia.org/wiki/Motherboard) sends a signal to the [power supply](https://en.wikipedia.org/wiki/Power_supply). In response, the power supply delivers the proper amount of electricity to other components of the computer. Once the motherboard receives the [power good signal](https://en.wikipedia.org/wiki/Power_good_signal), it triggers the CPU to start. The CPU then performs a reset: it clears any leftover data in its registers and loads predefined values into each of them, preparing for the very first instructions of the boot process.
-Anyway, if you're just starting to learn such tools, I will try to explain some parts during this and the following posts. Alright, this is the end of the simple introduction. Let's start to dive into the Linux kernel and low-level stuff!
+Each **x86_64** processor begins execution in a special mode called [real mode](https://en.wikipedia.org/wiki/Real_mode). This mode exists for historical reasons - to be compatible with the earliest processors. Real mode is supported on all x86-compatible processors — from the original [8086](https://en.wikipedia.org/wiki/Intel_8086) to today’s modern 64-bit CPUs.
-I started writing these posts at the time of the `3.18` Linux kernel, and many things have changed since that time. If there are changes, I will update the posts accordingly.
+The **8086** was a 16-bit microprocessor. Basically it means that its general-purpose registers and instruction pointer were `16` bits wide. However, the chip was designed with a `20-bit` physical memory address bus — the set of electrical lines used to select memory locations. With `20` address lines, the CPU can form addresses from `0x00000` to `0xFFFFF`, giving access to exactly `1 MB` of physical memory or `2^20` bytes.
-The Magical Power Button, What happens next?
---------------------------------------------------------------------------------
+Because the registers on **8086** processors were only `16` bits wide, the largest value they could hold was `0xFFFF` (`65_535`). This means that, using just a single 16-bit value, the CPU could only directly address 64 KB of memory at a time. This leads us to the question - how can a processor with 16-bit registers access 20-bit addresses? The answer is [memory segmentation](https://en.wikipedia.org/wiki/Memory_segmentation).
-Although this is a series of posts about the Linux kernel, we won't start directly from the kernel code. As soon as you press the magical power button on your laptop or desktop computer, it starts working. The motherboard sends a signal to the [power supply](https://en.wikipedia.org/wiki/Power_supply) device. After receiving the signal, the power supply provides the proper amount of electricity to the computer. Once the motherboard receives the [power good signal](https://en.wikipedia.org/wiki/Power_good_signal), it tries to start the CPU. The CPU resets all leftover data in its registers and sets predefined values for each of them.
+To make use of the entire 1 MB space provided by the 20-bit address bus, the **8086** used a scheme called [memory segmentation](https://en.wikipedia.org/wiki/Memory_segmentation). All memory is divided into small, fixed-size segments of `65_536` bytes each. Instead of using just one value to identify a memory location, the CPU uses two:
-The [80386](https://en.wikipedia.org/wiki/Intel_80386) and later CPUs define the following predefined data in CPU registers after the computer resets:
+1. Segment selector — identifies the starting point (base address) of a 64 KB segment. Represented by the value of the `cs` (code-segment) register.
+2. Offset — specifies how far into that segment the target address is. Represented by the value of the `ip` register.
+
+In real mode, the base address for a given segment selector is calculated as:
```
-IP 0xfff0
-CS selector 0xf000
-CS base 0xffff0000
+Base Address = Segment Selector << 4
```
-The processor starts working in [real mode](https://en.wikipedia.org/wiki/Real_mode). Let's back up a little and try to understand [memory segmentation](https://en.wikipedia.org/wiki/Memory_segmentation) in this mode. Real mode is supported on all x86-compatible processors, from the [8086](https://en.wikipedia.org/wiki/Intel_8086) CPU all the way to the modern Intel 64-bit CPUs. The `8086` processor has a 20-bit address bus, which means that it could work with a `0-0xFFFFF` or `1 megabyte` address space. But it only has `16-bit` registers, which have a maximum address of `2^16 - 1` or `0xffff` (64 kilobytes).
-
-[Memory segmentation](https://en.wikipedia.org/wiki/Memory_segmentation) is used to make use of all the address space available. All memory is divided into small, fixed-size segments of `65536` bytes (64 KB). Since we cannot address memory above `64 KB` with 16-bit registers, an alternate method was devised.
-
-An address consists of two parts: a segment selector, which has a base address; and an offset from this base address. In real mode, the associated base address of a segment selector is `Segment Selector * 16`. Thus, to get a physical address in memory, we need to multiply the segment selector part by `16` and add the offset to it:
+To compute the final physical memory address, the CPU adds the base address to the offset:
```
-PhysicalAddress = Segment Selector * 16 + Offset
+Physical Address = Base Address + Offset
```
-For example, if `CS:IP` is `0x2000:0x0010`, then the corresponding physical address will be:
+For example, if the value of the `cs:ip` is `0x2000:0x0010`, then the corresponding physical address will be:
```python
>>> hex((0x2000 << 4) + 0x0010)
'0x20010'
```
-But, if we take the largest segment selector and offset, `0xffff:0xffff`, then the resulting address will be:
+If we take the largest possible values for the segment selector and the offset - `0xFFFF:0xFFFF`, the resulting address will be:
```python
>>> hex((0xffff << 4) + 0xffff)
'0x10ffef'
```
-which is `65520` bytes past the first megabyte. Since only one megabyte is accessible in real mode, `0x10ffef` becomes `0x00ffef` with the [A20 line](https://en.wikipedia.org/wiki/A20_line) disabled.
+This gives us the address `0x10FFEF`, which is `65_520` bytes past the 1 MB boundary. Since the original **8086** CPU could only access the first 1 MB of memory in real mode, any address above `0xFFFFF` would wrap around to the beginning of the address space. On modern **386+** CPUs the physical bus is wider even in real mode, but the address computation is still based on the `segment:offset` pair.
-Ok, now we know a little bit about real mode and its memory addressing. Let's get back to discussing register values after reset.
+Now that we understand the basics of real mode and its memory addressing limitations, let’s return to the state after a hardware reset.
-The `CS` register consists of two parts: the visible segment selector and the hidden base address. In real-address mode, the base address is normally formed by shifting the 16-bit segment selector value 4 bits to the left to produce a 20-bit base address. However, during a hardware reset the segment selector in the CS register is loaded with `0xf000` and the base address is loaded with `0xffff0000`. The processor uses this special base address until `CS` changes.
+## First code executed after reset
-The starting address is formed by adding the base address to the value in the EIP register:
+The system has just been powered on, the reset signal has been released, and the processor is waking up to execute its first instructions. The [80386](https://en.wikipedia.org/wiki/Intel_80386) and later CPUs set the following [register](https://en.wikipedia.org/wiki/X86#x86_registers) values after a hardware reset:
+
+| Register | Value | Meaning |
+| ------------------ | ------------ | ------------------------------------------------------------------------------ |
+| `ip` | `0xFFF0` | Instruction pointer; execution starts here within the current code segment |
+| `cs` (selector) | `0xF000` | Visible code segment selector value after reset |
+| `cs` (base) | `0xFFFF0000` | Hidden descriptor base address loaded into `cs` during reset |
+
+In real mode, the base address is normally formed by shifting the 16-bit segment selector value 4 bits left to produce a 20-bit base address. However, after a hardware reset the first instruction is located at a special address. We can see that the segment selector in the `cs` register is loaded with `0xF000` but the hidden base address is loaded with `0xFFFF0000`. Instead of using the usual formula to get the address, the processor uses this value as the base address of the first instruction. Given the base address and the offset (from the `ip` register), the starting address will be:
```python
->>> 0xffff0000 + 0xfff0
+>>> hex(0xffff0000 + 0xfff0)
'0xfffffff0'
```
-We get `0xfffffff0`, which is 16 bytes below 4GB. This point is called the [reset vector](https://en.wikipedia.org/wiki/Reset_vector). It's the memory location at which the CPU expects to find the first instruction to execute after reset. It contains a [jump](https://en.wikipedia.org/wiki/JMP_%28x86_instruction%29) (`jmp`) instruction that usually points to the [BIOS](https://en.wikipedia.org/wiki/BIOS) (Basic Input/Output System) entry point. For example, if we look in the [coreboot](https://www.coreboot.org/) source code (`src/cpu/x86/16bit/reset16.inc`), we see:
+We got `0xFFFFFFF0`, which is 16 bytes below 4GB. This is the very first address where the CPU starts execution after reset. This address has a special name - the [reset vector](https://en.wikipedia.org/wiki/Reset_vector). It is the memory location at which the CPU expects to find the first instruction to execute after reset. Usually it contains a [jump](https://en.wikipedia.org/wiki/JMP_%28x86_instruction%29) (`jmp`) instruction which points to the [BIOS](https://en.wikipedia.org/wiki/BIOS) or [UEFI](https://en.wikipedia.org/wiki/UEFI) entry point. For example, if we take a look at the [source code](https://github.com/coreboot/coreboot/blob/main/src/cpu/x86/reset16.S) of [coreboot](https://www.coreboot.org/), we will see it there:
+
```assembly
- .section ".reset", "ax", %progbits
- .code16
-.globl _start
+ /* This is the first instruction the CPU runs when coming out of reset. */
+.section ".reset", "ax", %progbits
+.globl _start
_start:
- .byte 0xe9
- .int _start16bit - ( . + 2 )
- ...
+ jmp _start16bit
```
-Here we can see the `jmp` instruction [opcode](http://ref.x86asm.net/coder32.html#xE9), which is `0xe9`, and its destination address at `_start16bit - ( . + 2)`.
+To prove that this code is located at the `0xFFFFFFF0` address, we may take a look at the [linker script](https://github.com/coreboot/coreboot/blob/main/src/arch/x86/bootblock.ld):
-We also see that the `reset` section is `16` bytes and is compiled to start from the address `0xfffffff0` (`src/cpu/x86/16bit/reset16.ld`):
-
-```
-SECTIONS {
- /* Trigger an error if I have an unusable start address */
- _bogus = ASSERT(_start16bit >= 0xffff0000, "_start16bit too low. Please report.");
- _ROMTOP = 0xfffffff0;
- . = _ROMTOP;
- .reset . : {
- *(.reset);
- . = 15;
- BYTE(0x00);
- }
-}
+
+```linker-script
+ . = 0xfffffff0;
+ _X86_RESET_VECTOR = .;
+ .reset . : {
+ *(.reset);
+ . = _X86_RESET_VECTOR_FILLING;
+ BYTE(0);
+ }
```
-Now the BIOS starts. After initializing and checking the hardware, the BIOS needs to find a bootable device. A boot order is stored in the BIOS configuration, controlling which devices the BIOS attempts to boot from. When attempting to boot from a hard drive, the BIOS tries to find a boot sector. On hard drives partitioned with an [MBR partition layout](https://en.wikipedia.org/wiki/Master_boot_record), the boot sector is stored in the first `446` bytes of the first sector, where each sector is `512` bytes. The final two bytes of the first sector are `0x55` and `0xaa`, which designates to the BIOS that this device is bootable. Once the BIOS finds the boot sector, it copies it into a fixed memory location at 0x7c00, jumps to there and start executing it.
+The address `0xFFFFFFF0` is much larger than `0xFFFFF` (1MB). How can the CPU access this address in real mode? The answer is simple. Most likely you have something more modern than the **8086** CPU with its 20-bit address bus. More modern processors start in real mode, but with a 32-bit or 64-bit bus.
+
+When the CPU wakes up, it reads the jump at the `0xFFFFFFF0` address, jumps into the firmware, and the long chain of the boot process begins. This is the very first step on the way to boot the Linux kernel.
+
+## From Power-On to Bootloader
+
+We stopped at the point when a CPU jumps from the reset vector to the firmware. On a legacy PC, that means the BIOS. On modern computers it is UEFI. In the next chapters we will see the booting processes on a legacy PC using the BIOS, and later UEFI.
-For example:
+The first job of BIOS is to bring the system into a working state. It runs a series of hardware checks and initializations — memory tests, peripheral setup, chipset configuration — all part of the [POST](https://en.wikipedia.org/wiki/Power-on_self-test) routine. Once everything is checked, the next step is to find an operating system to boot. The BIOS doesn’t pick just a random disk. It follows a boot order, a list stored in its configuration.
+
+When the BIOS tries to boot from a hard drive, it looks for a [boot sector](https://en.wikipedia.org/wiki/Boot_sector). On hard drives partitioned with an [MBR partition layout](https://en.wikipedia.org/wiki/Master_boot_record), the boot sector is stored in the first `446` bytes of the first sector, where each sector is `512` bytes. The final two bytes of the first sector must be `0x55` and `0xAA`. These last two bytes tell the BIOS something like "yes - this device is bootable". Once the BIOS finds a valid boot sector, it copies it into the fixed memory location at `0x7C00`, jumps there, and starts executing it.
+
+In general, real mode's memory map is as follows:
+
+| Address Range | Description |
+|-----------------------|--------------------------------------|
+| 0x00000000–0x000003FF | Real Mode Interrupt Vector Table |
+| 0x00000400–0x000004FF | BIOS Data Area |
+| 0x00000500–0x00007BFF | Unused |
+| 0x00007C00–0x00007DFF | Bootloader |
+| 0x00007E00–0x0009FFFF | Unused |
+| 0x000A0000–0x000BFFFF | Video RAM (VRAM) Memory |
+| 0x000B0000–0x000B7FFF | Monochrome Video Memory |
+| 0x000B8000–0x000BFFFF | Color Video Memory |
+| 0x000C0000–0x000C7FFF | Video ROM BIOS |
+| 0x000C8000–0x000EFFFF | BIOS Shadow Area |
+| 0x000F0000–0x000FFFFF | System BIOS |
+
+We can do a simple experiment and create a very primitive boot code:
```assembly
-;
-; Note: this example is written in Intel Assembly syntax
-;
+;;
+;; Note: this example is written using NASM assembler
+;;
[BITS 16]
boot:
+ ;; Symbol to print
mov al, '!'
+ ;; TTY-style text output
mov ah, 0x0e
+ ;; Position where to print the character
mov bh, 0x00
+ ;; Color
mov bl, 0x07
-
+ ;; Interrupt call
int 0x10
jmp $
@@ -128,455 +155,396 @@ db 0x55
db 0xaa
```
-Build and run this with:
+You can build and run this code using the following commands:
-```
-nasm -f bin boot.nasm && qemu-system-x86_64 boot
+```bash
+nasm -f bin boot.S && qemu-system-x86_64 boot -nographic
```
-This will instruct [QEMU](https://www.qemu.org/) to use the `boot` binary that we just built as a disk image. Since the binary generated by the assembly code above fulfills the requirements of the boot sector (we end it with the magic sequence), QEMU will treat the binary as the master boot record (MBR) of a disk image. Note that when providing a boot binary image to QEMU, setting the origin to 0x7c00 (using `[ORG 0x7c00]`)
-is unneeded.
+This will instruct the [QEMU](https://www.qemu.org/) virtual machine to use the `boot` binary that we just built as a disk image. Since the binary generated by the assembly code above fulfills the requirements of the boot sector (we end it with the magic sequence), QEMU will treat the binary as the master boot record (MBR) of a disk image.
-You will see:
-
-
-
-In this example, we can see that the code will be executed in `16-bit` real mode. After starting, it calls the [0x10](http://www.ctyme.com/intr/rb-0106.htm) interrupt, which just prints the `!` symbol. The times directive will pad that number of bytes up to 510th byte with zeros and finishes with the two magic bytes `0xaa` and `0x55`.
-
-You can see a binary dump of this using the `objdump` utility:
+If you did everything correctly, you will see something like this after running the command above:
```
-nasm -f bin boot.nasm
-objdump -D -b binary -mi386 -Maddr16,data16,intel boot
-```
-
-A real-world boot sector has code for continuing the boot process and a partition table instead of a bunch of 0's and an exclamation mark. :) From this point onwards, the BIOS hands control over to the bootloader.
+SeaBIOS (version 1.17.0-5.fc42)
-**NOTE**: As explained above, the CPU is in real mode. In real mode, calculating the physical address in memory is done as follows:
+iPXE (https://ipxe.org) 00:03.0 CA00 PCI2.10 PnP PMM+06FCAEC0+06F0AEC0 CA00
-```
-PhysicalAddress = Segment Selector * 16 + Offset
+Booting from Hard Disk...
+!
```
-just as explained above. We have only 16-bit general purpose registers, which has a maximum value of `0xffff`, so if we take the largest values the result will be:
+Of course, a real-world boot sector contains "slightly" more code, which loads an operating system instead of printing an exclamation mark, but it may be interesting to experiment. In this example, we can see that the code will be executed in `16-bit` real mode, which is specified by the `[BITS 16]` directive. After starting, it calls the [0x10](https://en.wikipedia.org/wiki/INT_10H) interrupt, which just prints the `!` symbol. The `times` directive pads the binary with zeros up to the `510th` byte. At the end, we "hard-code" the last two magic bytes, `0x55` and `0xAA`. To exit from the virtual machine, press `Ctrl+a x`.
-```python
->>> hex((0xffff * 16) + 0xffff)
-'0x10ffef'
-```
+From this point onwards, the BIOS hands control over to the bootloader.
-where `0x10ffef` is equal to `(1MB + 64KB - 16B) - 1`. An [8086](https://en.wikipedia.org/wiki/Intel_8086) processor (which was the first processor with real mode), in contrast, has a 20-bit address line. Since `2^20 = 1048576` is 1MB and `2^20 - 1` is the maximum address that could be used, this means that the actual available memory is 1MB.
+## The Bootloader Stage
-In general, real mode's memory map is as follows:
+There are a number of different bootloaders that can boot the Linux kernel, such as [GRUB 2](https://www.gnu.org/software/grub/), [syslinux](http://www.syslinux.org/wiki/index.php/The_Syslinux_Project), [systemd-boot](https://www.freedesktop.org/wiki/Software/systemd/systemd-boot/), and others. The Linux kernel has a [Boot protocol](https://github.com/torvalds/linux/blob/master/Documentation/arch/x86/boot.rst) which specifies the requirements for a bootloader to implement Linux support. In this chapter, we will take a short look at how GRUB 2 loads the kernel.
-```
-0x00000000 - 0x000003FF - Real Mode Interrupt Vector Table
-0x00000400 - 0x000004FF - BIOS Data Area
-0x00000500 - 0x00007BFF - Unused
-0x00007C00 - 0x00007DFF - Our Bootloader
-0x00007E00 - 0x0009FFFF - Unused
-0x000A0000 - 0x000BFFFF - Video RAM (VRAM) Memory
-0x000B0000 - 0x000B7777 - Monochrome Video Memory
-0x000B8000 - 0x000BFFFF - Color Video Memory
-0x000C0000 - 0x000C7FFF - Video ROM BIOS
-0x000C8000 - 0x000EFFFF - BIOS Shadow Area
-0x000F0000 - 0x000FFFFF - System BIOS
-```
-
-At the beginning of this post, I wrote that the first instruction executed by the CPU is located at address `0xFFFFFFF0`, which is much larger than `0xFFFFF` (1MB). How can the CPU access this address in real mode? The answer is in the [coreboot](https://www.coreboot.org/Developer_Manual/Memory_map) documentation:
-
-```
-0xFFFE_0000 - 0xFFFF_FFFF: 128 kilobyte ROM mapped into address space
-```
+Continuing from where we left off - the BIOS has now selected a boot device, found its boot sector, loaded it into memory and passed control to the code located there. The GRUB 2 bootloader consists of multiple [stages](https://www.gnu.org/software/grub/manual/grub/grub.html#Images). The first stage of the boot code is in the [boot.S](https://github.com/rhboot/grub2/blob/master/grub-core/boot/i386/pc/boot.S) source code file. Due to the limited amount of space in the first boot sector, this code has only a single goal - to load the [core image](https://www.gnu.org/software/grub/manual/grub/html_node/Images.html) into memory and jump to it.
-At the start of execution, the BIOS is not in RAM, but in ROM.
+The core image starts with [diskboot.S](https://github.com/rhboot/grub2/blob/master/grub-core/boot/i386/pc/diskboot.S), which is usually stored right after the first sector of the disk. The code from the `diskboot.S` file loads the rest of the core image into memory. The core image contains the code of the loader itself and drivers for reading different filesystems. After the whole core image is loaded into memory, the execution continues from the [grub_main](https://github.com/rhboot/grub2/blob/master/grub-core/kern/main.c) function. This is where GRUB sets up the environment it needs to operate:
-Bootloader
---------------------------------------------------------------------------------
+- Initializes the console so messages and menus can be displayed.
+- Sets the root device — the disk from which GRUB will read its modules and configuration files.
+- Loads and parses the GRUB configuration file.
+- Loads required modules.
-There are a number of bootloaders that can boot Linux, such as [GRUB 2](https://www.gnu.org/software/grub/) and [syslinux](http://www.syslinux.org/wiki/index.php/The_Syslinux_Project). The Linux kernel has a [Boot protocol](https://github.com/torvalds/linux/blob/v4.16/Documentation/x86/boot.txt) which specifies the requirements for a bootloader to implement Linux support. This example will describe GRUB 2.
+Once these tasks are complete, we may see the familiar GRUB menu where we can choose the operating system we want to load. When we select one of the menu entries, GRUB executes the [boot](https://www.gnu.org/software/grub/manual/grub/grub.html#boot) command which boots the selected operating system. So how does the loader load the Linux kernel? To answer this question, we need to get back to the Linux kernel boot protocol.
-Continuing from before, now that the BIOS has chosen a boot device and transferred control to the boot sector code, execution starts from [boot.img](http://git.savannah.gnu.org/gitweb/?p=grub.git;a=blob;f=grub-core/boot/i386/pc/boot.S;hb=HEAD). Its code is very simple, due to the limited amount of space available. It contains a pointer which is used to jump to the location of GRUB 2's core image. The core image begins with [diskboot.img](http://git.savannah.gnu.org/gitweb/?p=grub.git;a=blob;f=grub-core/boot/i386/pc/diskboot.S;hb=HEAD), which is usually stored immediately after the first sector in the unused space before the first partition. The above code loads the rest of the core image, which contains GRUB 2's kernel and drivers for handling filesystems, into memory. After loading the rest of the core image, it executes the [grub_main](http://git.savannah.gnu.org/gitweb/?p=grub.git;a=blob;f=grub-core/kern/main.c) function.
+As we can read in the [documentation](https://github.com/torvalds/linux/blob/master/Documentation/arch/x86/boot.rst), the bootloader must load the kernel into memory, fill some fields in the kernel setup header and pass control to the kernel code. The very first part of the kernel code is the so-called kernel setup header and setup code. The kernel setup header is a special structure embedded in the early Linux boot code that provides fields describing how the kernel should be loaded and started. The setup header starts at the `0x01F1` offset from the beginning of the kernel image. We may look at the boot [linker script](https://github.com/torvalds/linux/blob/master/arch/x86/boot/setup.ld) to confirm the value of this offset:
-The `grub_main` function initializes the console, gets the base address for modules, sets the root device, loads/parses the grub configuration file, loads modules, etc. At the end of execution, the `grub_main` function moves grub to normal mode. The `grub_normal_execute` function (from the `grub-core/normal/main.c` source code file) completes the final preparations and shows a menu to select an operating system. When we select one of the grub menu entries, the `grub_menu_execute_entry` function runs, executing the grub `boot` command and booting the selected operating system.
+
+```linker-script
+ . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!");
+```
-As we can read in the kernel boot protocol, the bootloader must read and fill some fields of the kernel setup header, which starts at offset `0x01f1` from the kernel setup code. You may look at the boot [linker script](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/setup.ld) to confirm the value of this offset. The kernel header [arch/x86/boot/header.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/header.S) starts from:
+The kernel [setup header](https://github.com/torvalds/linux/blob/master/arch/x86/boot/header.S) is split into two parts, and the first part starts with the following fields:
+
```assembly
- .globl hdr
+ .globl hdr
hdr:
- setup_sects: .byte 0
- root_flags: .word ROOT_RDONLY
- syssize: .long 0
- ram_size: .word 0
- vid_mode: .word SVGA_MODE
- root_dev: .word 0
- boot_flag: .word 0xAA55
+ .byte setup_sects - 1
+root_flags: .word ROOT_RDONLY
+syssize: .long ZO__edata / 16
+ram_size: .word 0 /* Obsolete */
+vid_mode: .word SVGA_MODE
+root_dev: .word 0 /* Default to major/minor 0/0 */
+boot_flag: .word 0xAA55
```
-The bootloader must fill this and the rest of the headers (which are only marked as being type `write` in the Linux boot protocol, such as in [this example](https://github.com/torvalds/linux/blob/v4.16/Documentation/x86/boot.txt#L354)) with values either received from the command line or calculated during booting. (We will not go over full descriptions and explanations for all fields of the kernel setup header for now, but we shall do so when discussing how the kernel uses them. You can find a description of all fields in the [boot protocol](https://github.com/torvalds/linux/blob/v4.16/Documentation/x86/boot.txt#L156).)
+The bootloader may fill some of the fields in the setup header which are marked as being of type `write` or `modify` in the Linux boot protocol. The values set by the bootloader are taken from its configuration or calculated during boot. Of course, we will not go over full descriptions and explanations of all the fields of the kernel setup header. Instead, we will take a closer look at a particular field when we meet it during our research of the kernel code.
-As we can see in the kernel boot protocol, memory will be mapped as follows after loading the kernel:
+According to the Linux kernel boot protocol, memory will be mapped as follows after loading the kernel:
-```shell
- | Protected-mode kernel |
-100000 +------------------------+
- | I/O memory hole |
-0A0000 +------------------------+
- | Reserved for BIOS | Leave as much as possible unused
- ~ ~
- | Command line | (Can also be below the X+10000 mark)
-X+10000 +------------------------+
- | Stack/heap | For use by the kernel real-mode code.
-X+08000 +------------------------+
- | Kernel setup | The kernel real-mode code.
- | Kernel boot sector | The kernel legacy boot sector.
- X +------------------------+
- | Boot loader | <- Boot sector entry point 0x7C00
-001000 +------------------------+
- | Reserved for MBR/BIOS |
-000800 +------------------------+
- | Typically used by MBR |
-000600 +------------------------+
- | BIOS use only |
-000000 +------------------------+
+```
+ ~ ~
+ | Protected-mode kernel |
+100000 +------------------------+
+ | I/O memory hole |
+0A0000 +------------------------+
+ | Reserved for BIOS | Leave as much as possible unused
+ ~ ~
+ | Command line | (Can also be below the X+10000 mark)
+X+10000 +------------------------+
+ | Stack/heap | For use by the kernel real-mode code.
+X+08000 +------------------------+
+ | Kernel setup | The kernel real-mode code.
+ | Kernel boot sector | The kernel legacy boot sector.
+X +------------------------+
+ | Boot loader | <- Boot sector entry point 0000:7C00
+001000 +------------------------+
+ | Reserved for MBR/BIOS |
+000800 +------------------------+
+ | Typically used by MBR |
+000600 +------------------------+
+ | BIOS use only |
+000000 +------------------------+
+... where the address X is as low as the design of the boot loader permits.
```
-When the bootloader transfers control to the kernel, it starts at:
+We can see that when the bootloader transfers control to the kernel, execution starts right after the kernel’s boot sector — that is, at the address `X` plus the length of the boot sector. The value of this `X` depends on how the kernel is loaded. For example, if I load the kernel directly with [qemu](https://www.qemu.org/), the starting address of the kernel image is `0x10000`:
-```
-X + sizeof(KernelBootSector) + 1
+```bash
+hexdump -C /tmp/dump | grep MZ
+00010000 4d 5a 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |MZ..............|
```
-where `X` is the address of the kernel boot sector being loaded. In my case, `X` is `0x10000`, as we can see in a memory dump:
+The Linux kernel image starts with the `4D 5A` bytes, as you may see at the beginning of the kernel setup code:
-
-
-How to get this memory dump in real mode?
---------------------------------------------------------------------------------
-```
-root@parallels-vm:/usr/src/linux# more arch/x86/kernel/vmlinux.lds
-...
-SECTIONS
-{
- . = (0xffffffff80000000 + ALIGN(0x1000000, 0x200000));
- phys_startup_64 = ABSOLUTE(startup_64 - 0xffffffff80000000);
- .text : AT(ADDR(.text) - 0xffffffff80000000) {
- _text = .;
- _stext = .;
-....
+
+```assembly
+ .code16
+ .section ".bstext", "ax"
+#ifdef CONFIG_EFI_STUB
+ # "MZ", MS-DOS header
+ .word IMAGE_DOS_SIGNATURE
```
-```
-root@parallels-vm:/usr/src/linux# nm vmlinux|grep startup_64
-0000000001000000 A phys_startup_64
-ffffffff81000030 T secondary_startup_64
-ffffffff810001f0 T __startup_64
-ffffffff81000000 T startup_64
-```
+If you want to get a similar memory dump, follow these steps. First of all, you need to build the kernel. If you do not know how to do it, you can find detailed instructions [here](https://github.com/0xAX/linux-insides/blob/master/Misc/linux-misc-1.md). On the diagram above, we can see that the `Protected-mode` kernel starts from `0x100000`. Knowing this address, we can start the kernel in the qemu virtual machine with the following command:
-Here we can see the memory address of the entry point, which is `0x0000000001000000`. Let's go ahead.
+```bash
+sudo qemu-system-x86_64 -kernel ./linux/arch/x86/boot/bzImage \
+ -nographic \
+ -append "console=ttyS0 nokaslr" \
+ -initrd /boot/initramfs-6.17.0-rc1-g8f5ae30d69d7.img -s -S
+```
-Before trying to debug the kernel, please see [Booting a Custom Linux Kernel in QEMU and Debugging It With GDB](http://nickdesaulniers.github.io/blog/2018/10/24/booting-a-custom-linux-kernel-in-qemu-and-debugging-it-with-gdb/)
+After the virtual machine is started, we can attach the debugger to it, set up a breakpoint on the entry point and get the dump:
-#### Step 1
-Booting in QEMU
-```
-qemu-system-x86_64 -kernel /usr/src/linux-4.14.207/arch/x86_64/boot/bzImage -nographic -append "console=ttyS0 nokaslr" -initrd /data/busybox/busybox-1.28.0/initramfs.cpio.gz -S -s
-```
-#### Step 2
-Attaching GDB to QEMU
-```
+```bash
gdb vmlinux
(gdb) target remote :1234
-(gdb) hbreak *0x0000000001000000
+(gdb) hbreak *0x100000
(gdb) c
+Continuing.
+
+Breakpoint 1, 0x0000000000100000 in ?? ()
(gdb) dump binary memory /tmp/dump 0x0000 0x20000
```
-#### Step 3
-```
-root@parallels-vm:/# hd /tmp/dump |grep -A 31 MZ
-00010000 4d 5a ea 07 00 c0 07 8c c8 8e d8 8e c0 8e d0 31 |MZ.............1|
-00010010 e4 fb fc be 40 00 ac 20 c0 74 09 b4 0e bb 07 00 |....@.. .t......|
-00010020 cd 10 eb f2 31 c0 cd 16 cd 19 ea f0 ff 00 f0 00 |....1...........|
-00010030 00 00 00 00 00 00 00 00 00 00 00 00 82 00 00 00 |................|
-00010040 55 73 65 20 61 20 62 6f 6f 74 20 6c 6f 61 64 65 |Use a boot loade|
-00010050 72 2e 0d 0a 0a 52 65 6d 6f 76 65 20 64 69 73 6b |r....Remove disk|
-00010060 20 61 6e 64 20 70 72 65 73 73 20 61 6e 79 20 6b | and press any k|
-00010070 65 79 20 74 6f 20 72 65 62 6f 6f 74 2e 2e 2e 0d |ey to reboot....|
-00010080 0a 00 50 45 00 00 64 86 04 00 00 00 00 00 00 00 |..PE..d.........|
-00010090 00 00 01 00 00 00 a0 00 06 02 0b 02 02 14 20 d5 |.............. .|
-000100a0 80 00 00 00 00 00 e0 b8 79 01 80 46 00 00 00 02 |........y..F....|
-000100b0 00 00 00 00 00 00 00 00 00 00 20 00 00 00 20 00 |.......... ... .|
-000100c0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
-000100d0 00 00 00 90 fa 01 00 02 00 00 00 00 00 00 0a 00 |................|
-000100e0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
-*
-00010100 00 00 00 00 00 00 06 00 00 00 00 00 00 00 00 00 |................|
-00010110 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
-*
-00010130 00 00 00 00 00 00 00 00 00 00 2e 73 65 74 75 70 |...........setup|
-00010140 00 00 e0 41 00 00 00 02 00 00 e0 41 00 00 00 02 |...A.......A....|
-00010150 00 00 00 00 00 00 00 00 00 00 00 00 00 00 20 00 |.............. .|
-00010160 50 60 2e 72 65 6c 6f 63 00 00 20 00 00 00 e0 43 |P`.reloc.. ....C|
-00010170 00 00 20 00 00 00 e0 43 00 00 00 00 00 00 00 00 |.. ....C........|
-00010180 00 00 00 00 00 00 40 00 10 42 2e 74 65 78 74 00 |......@..B.text.|
-00010190 00 00 20 93 80 00 00 44 00 00 20 93 80 00 00 44 |.. ....D.. ....D|
-000101a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 20 00 |.............. .|
-000101b0 50 60 2e 62 73 73 00 00 00 00 e0 b8 79 01 20 d7 |P`.bss......y. .|
-000101c0 80 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
-000101d0 00 00 00 00 00 00 80 00 00 c8 00 00 00 00 00 00 |................|
-000101e0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ff |................|
-000101f0 ff 21 01 00 32 09 08 00 00 00 ff ff 00 00 55 aa |.!..2.........U.|
-```
-The bootloader has now loaded the Linux kernel into memory, filled the header fields, and then jumped to the corresponding memory address. We now move directly to the kernel setup code.
+After this, you should be able to find your dump in the `/tmp/dump` file.
+
+If we load the Linux kernel using the GRUB 2 bootloader, this `X` address will be `0x90000`. Let's take a look at how to do it and check. First of all, you need to prepare an image with the kernel and GRUB 2. To do so, execute the following commands:
+
+```bash
+qemu-img create hdd.img 64M
+parted hdd.img --script mklabel msdos
+parted hdd.img --script mkpart primary ext2 1MiB 100%
+parted hdd.img --script set 1 boot on
+sudo losetup -fP hdd.img
+sudo mkfs.ext2 /dev/loop0p1
+sudo mount /dev/loop0p1 /mnt/tmp
+sudo mkdir -p /mnt/tmp/boot/grub
+sudo grub2-install \
+ --target=i386-pc \
+ --boot-directory=/mnt/tmp/boot \
+ /dev/loop0
+sudo cp ./arch/x86/boot/bzImage /mnt/tmp/boot/
+sudo tee /mnt/tmp/boot/grub/grub.cfg > /dev/null <
```assembly
- .globl _start
_start:
- .byte 0xeb
- .byte start_of_setup-1f
+ # Explicitly enter this as bytes, or the assembler
+ # tries to generate a 3-byte jump here, which causes
+ # everything else to push off to the wrong offset.
+ .byte 0xeb # short (2-byte) jump
+ .byte start_of_setup-1f
1:
- //
- // rest of the header
- //
```
-Here we can see a `jmp` instruction opcode (`0xeb`) that jumps to the `start_of_setup-1f` point. In `Nf` notation, `2f`, for example, refers to the local label `2:`. In our case, it's label `1:` that is present right after the jump, and contains the rest of the setup [header](https://github.com/torvalds/linux/blob/v4.16/Documentation/x86/boot.txt#L156). Right after the setup header, we see the `.entrytext` section, which starts at the `start_of_setup` label.
-
-This is the first code that actually runs (aside from the previous jump instructions, of course). After the kernel setup part receives control from the bootloader, the first `jmp` instruction is located at the `0x200` offset from the start of the kernel real mode, i.e., after the first 512 bytes. This can be seen in both the Linux kernel boot protocol and the GRUB 2 source code:
+The very first instruction we encounter here is the jump specified by the `0xEB` opcode. The second byte is the distance to jump. If you’ve never met the `Nf` syntax before, `1f` means the next label `1` that will appear in the code. Immediately after those two bytes is the label `1`, which is located right before the beginning of the second part of the kernel setup header. Right after the second part of the setup header, we see the `.entrytext` section, which starts at the `start_of_setup` label. This is exactly the place where the execution will continue. But where are we jumping from? After the kernel setup code receives control from the bootloader, the first `jmp` instruction is located at the `0x200` bytes offset from the start of the loaded kernel image. This can be seen in both the Linux kernel boot protocol and the GRUB 2 [source code](https://github.com/rhboot/grub2/blob/master/grub-core/loader/i386/pc/linux.c):
```C
segment = grub_linux_real_target >> 4;
state.gs = state.fs = state.es = state.ds = state.ss = segment;
state.cs = segment + 0x20;
+state.ip = 0;
```
-In my case, the kernel is loaded at the physical address `0x10000`. This means that segment registers have the following values after kernel setup starts:
+Here, `grub_linux_real_target` is the physical load address of the setup code. As we have seen in the previous section, this address is usually `0x90000`. Shifting it right by four divides it by `16`, converting a physical address into a segment value - that’s how real mode memory segmentation works. Then GRUB adds `0x20` to `cs` before starting execution. Why `0x20`? Let's remember that in real mode, physical addresses are computed as:
```
-gs = fs = es = ds = ss = 0x1000
-cs = 0x1020
+Physical = (cs << 4) + ip
```
-After the jump to `start_of_setup`, the kernel needs to do the following:
+With `ip = 0` and `cs` increased by `0x20`, the offset from the start of the loaded image is:
-* Make sure that all segment register values are equal
-* Set up a correct stack, if needed
-* Set up [bss](https://en.wikipedia.org/wiki/.bss)
-* Jump to the C code in [arch/x86/boot/main.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/main.c)
+```
+0x20 << 4 = 0x200
+```
-Let's look at the implementation.
+This is 512 bytes — exactly the offset where our jump instruction resides in the image.
-Aligning the Segment Registers
---------------------------------------------------------------------------------
+After the jump to the `start_of_setup` label, the kernel setup code enters the very first phase of its real work:
-First of all, the kernel ensures that the `ds` and `es` segment registers point to the same address. Next, it clears the direction flag using the `cld` instruction:
+- Unifying the segment registers
+- Establishing a valid stack
+- Clearing the `.bss` section
+- Transitioning into C code
-```assembly
- movw %ds, %ax
- movw %ax, %es
- cld
-```
+In the next sections, we’ll walk through each of these steps in detail.
-As I wrote earlier, `grub2` loads kernel setup code at address `0x10000` by default and `cs` at `0x1020` because execution doesn't start from the start of the file, but from the jump here:
+### Aligning the segment registers
-```assembly
-_start:
- .byte 0xeb
- .byte start_of_setup-1f
-```
-
-which is at a `512` byte offset from [4d 5a](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/header.S#L46). We also need to align `cs` from `0x1020` to `0x1000`, as well as all other segment registers. After that, we set up the stack:
+First of all, the kernel setup code ensures that the `ds` and `es` segment registers point to the same address. Next, it clears the [direction flag](https://en.wikipedia.org/wiki/Direction_flag) using the `cld` instruction:
+
```assembly
- pushw %ds
- pushw $6f
- lretw
+ .section ".entrytext", "ax"
+start_of_setup:
+# Force %es = %ds
+ movw %ds, %ax
+ movw %ax, %es
+ cld
```
-which pushes the value of `ds` to the stack, followed by the address of the [6](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/header.S#L602) label and executes the `lretw` instruction. When the `lretw` instruction is called, it loads the address of label `6` into the [instruction pointer](https://en.wikipedia.org/wiki/Program_counter) register and loads `cs` with the value of `ds`. Afterward, `ds` and `cs` will have the same values.
+We need to do both of these two things to clear the [bss](https://en.wikipedia.org/wiki/.bss) section properly a bit later. From this point we are sure that both `ds` and `es` segment registers point to the same address - `0x9000`.
-Stack Setup
---------------------------------------------------------------------------------
+### Stack Setup
-Almost all of the setup code is for preparing the C language environment in real mode. The next [step](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/header.S#L575) is checking the `ss` register's value and setting up a correct stack if `ss` is wrong:
+We need to prepare the environment for C code. The next step is to set up the stack. Let's take a look at the next lines of the code:
+
```assembly
- movw %ss, %dx
- cmpw %ax, %dx
- movw %sp, %dx
- je 2f
+ movw %ss, %dx
+ cmpw %ax, %dx # %ds == %ss?
+ movw %sp, %dx
+ je 2f # -> assume %sp is reasonably set
```
-This can lead to 3 different scenarios:
-
-* `ss` has a valid value `0x1000` (as do all the other segment registers besides `cs`)
-* `ss` is invalid and the `CAN_USE_HEAP` flag is set (see below)
-* `ss` is invalid and the `CAN_USE_HEAP` flag is not set (see below)
-
-Let's look at all three of these scenarios in turn:
-
-* `ss` has a correct address (`0x1000`). In this case, we go to label [2](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/header.S#L589):
+Here we compare the values of the `ss` and `ds` registers. According to the comment around this code, only old versions of the [LILO](https://en.wikipedia.org/wiki/LILO_(bootloader)) bootloader may set these registers to different values. So we will skip all the "edge cases" and consider only the single case when the value of the `ss` register is equal to `ds`. Since the values of these registers are equal, we jump to the `2` label:
+
```assembly
-2: andw $~3, %dx
- jnz 3f
- movw $0xfffc, %dx
-3: movw %ax, %ss
- movzwl %dx, %esp
- sti
+2: # Now %dx should point to the end of our stack space
+ andw $~3, %dx # dword align (might as well...)
+ jnz 3f
+ movw $0xfffc, %dx # Make sure we're not zero
+3: movw %ax, %ss
+ movzwl %dx, %esp # Clear upper half of %esp
+ sti # Now we should have a working stack
```
-Here we set the alignment of `dx` (which contains the value of `sp` as given by the bootloader) to `4` bytes and check if it is zero. If it is, we set `dx` to `0xfffc` (The last 4-byte aligned address in a 64KB segment). If it is not zero, we continue to use the value of `sp` given by the bootloader (`0xf7f4` in my case). Afterwards, we put the value of `ax` (`0x1000`) into `ss`. We now have a correct stack:
-
-
-
-* The second scenario, (`ss` != `ds`). First, we put the value of [_end](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/setup.ld) (the address of the end of the setup code) into `dx` and check the `loadflags` header field using the `testb` instruction to see whether we can use the heap. [loadflags](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/header.S#L320) is a bitmask header defined as:
+The `dx` register stores the stack pointer value, which should point to the top of the stack. The value of the stack pointer is `0x9000`. The GRUB 2 bootloader sets it while loading the Linux kernel image, and the address is defined by:
+
```C
-#define LOADED_HIGH (1<<0)
-#define QUIET_FLAG (1<<5)
-#define KEEP_SEGMENTS (1<<6)
-#define CAN_USE_HEAP (1<<7)
+#define GRUB_LINUX_SETUP_STACK 0x9000
```
-and as we can read in the boot protocol:
+At the next step, we align the value of the stack pointer down to a four-byte boundary and check whether the result is zero. If it is not zero, we jump to the label `3` and continue to use the aligned value. If the result is zero, we set it to the `0xFFFC` value. The reason for this is that we cannot have a stack pointer equal to zero, because the stack grows down when something is pushed onto it. The `0xFFFC` value is the highest 4‑byte aligned address below `0x10000`.
-```
-Field name: loadflags
+From this point, we have a correct stack, which starts from `0x9000:0x9000` and grows down:
- This field is a bitmask.
+
- Bit 7 (write): CAN_USE_HEAP
- Set this bit to 1 to indicate that the value entered in the
- heap_end_ptr is valid. If this field is clear, some setup code
- functionality will be disabled.
-```
+### BSS Setup
-If the `CAN_USE_HEAP` bit is set, we put `heap_end_ptr` into `dx` (which points to `_end`) and add `STACK_SIZE` (the minimum stack size, `1024` bytes) to it. After this, if `dx` is not carried (it will not be carried, `dx = _end + 1024`), jump to label `2` (as in the previous case) and make a correct stack.
+Before the kernel can switch to C code, two final tasks must be done:
-
+- Verify the “magic” signature.
+- Clear the `.bss` section.
-* When `CAN_USE_HEAP` is not set, we just use a minimal stack from `_end` to `_end + STACK_SIZE`:
+The first is the signature checking:
-
+
+```assembly
+ cmpl $0x5a5aaa55, setup_sig
+ jne setup_bad
+```
-BSS Setup
---------------------------------------------------------------------------------
+This simply compares the [setup_sig](https://github.com/torvalds/linux/blob/master/arch/x86/boot/setup.ld) constant value placed by the linker with the magic number `0x5A5AAA55`. If they are not equal, the setup code reports a fatal error and stops execution. The main goal of this check is to ensure we are actually running a valid Linux kernel setup binary, loaded into the proper place by the bootloader.
-The last two steps that need to happen before we can jump to the main C code are setting up the [BSS](https://en.wikipedia.org/wiki/.bss) area and checking the "magic" signature. First, signature checking:
+With the magic number confirmed, and knowing our segment registers and stack are already in the proper state, the only initialization left is to clear the `.bss` section. The section of memory is used to store statically allocated, uninitialized data. Let's take a look at the initialization of this memory area:
+
```assembly
- cmpl $0x5a5aaa55, setup_sig
- jne setup_bad
+ movw $__bss_start, %di
+ movw $_end+3, %cx
+ xorl %eax, %eax
+ subw %di, %cx
+ shrw $2, %cx
+ rep stosl
```
-This simply compares the [setup_sig](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/setup.ld) with the magic number `0x5a5aaa55`. If they are not equal, a fatal error is reported.
+The main goal of this code is to clear, or in other words to fill with zeros, the memory area between `__bss_start` and `_end`. To fill this memory area with zeros, the `rep stos` instruction is used. This instruction stores the value of the `eax` register at the destination pointed to by `es:di`. That is why we unified the values of the `ds` and `es` registers. The `rep` prefix repeats the `stos` instruction the number of times given by the value of the `cx` register.
-If the magic number matches, knowing we have a set of correct segment registers and a stack, we only need to set up the BSS section before jumping into the C code.
+To clear this memory area, we first set its borders - from [__bss_start](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/setup.ld) to `_end + 3`. We add `3` bytes to the `_end` address because we are going to write zeros in double words (4 bytes at a time). Adding three bytes ensures that when we later divide by four, any remainder at the end of the memory area still gets covered. After we set up the borders of the memory area and fill `eax` with 0 using the `xor` instruction, `rep stosl` does its job.
-The BSS section is used to store statically allocated, uninitialized data. Linux carefully ensures this area of memory is first zeroed using the following code:
+The effect of this code is that zeros are written through all the memory from `__bss_start` to `_end`. To find out their exact addresses, we can inspect the `setup.elf` file with the [readelf](https://en.wikipedia.org/wiki/Readelf) utility:
-```assembly
- movw $__bss_start, %di
- movw $_end+3, %cx
- xorl %eax, %eax
- subw %di, %cx
- shrw $2, %cx
- rep; stosl
+```bash
+$ readelf -a arch/x86/boot/setup.elf | grep bss
+ [12] .bss NOBITS 00003f00 004efc 001380 00 WA 0 0 32
+ 00 .bstext .header .entrytext .inittext .initdata .text .text32 .rodata .videocards .data .signature .bss
+ 145: 00005280 0 NOTYPE GLOBAL DEFAULT 12 __bss_end
+ 169: 00003f00 0 NOTYPE GLOBAL DEFAULT 12 __bss_start
```
-First, the [__bss_start](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/setup.ld) address is moved into `di`. Next, the `_end + 3` address (+3 - aligns to 4 bytes) is moved into `cx`. The `eax` register is cleared (using the `xor` instruction), and the bss section size (`cx - di`) is calculated and put into `cx`. Then, `cx` is divided by four (the size of a 'word'), and the `stosl` instruction is used repeatedly, storing the value of `eax` (zero) into the address pointed to by `di`, automatically increasing `di` by four, repeating until `cx` reaches zero. The net effect of this code is that zeros are written through all words in memory from `__bss_start` to `_end`:
+These are offsets inside the setup segment. Since in our case the kernel image is loaded at physical address `0x90000`, the symbols translate to:
+
+- __bss_start = 0x90000 + 0x3f00 = 0x93F00
+- __bss_end = 0x90000 + 0x5280 = 0x95280
-
+The following diagram illustrates how the setup image, `.bss`, and the stack region are laid out in memory:
-Jump to main
---------------------------------------------------------------------------------
+
-That's all! We have the stack and BSS, so we can jump to the `main()` C function:
+### Jump to C code
+At this point we have initialized the [stack](#stack-setup) and [.bss](#bss-setup) sections. The last instruction of the early kernel setup assembly is to jump to C code:
+
+
```assembly
- calll main
+ calll main
```
-The `main()` function is located in [arch/x86/boot/main.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/main.c). You can read about what this does in the next part.
+The `main()` function is located in [arch/x86/boot/main.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/main.c) source code file.
+
+What's happening there, we will see in the next chapter.
-Conclusion
---------------------------------------------------------------------------------
+## Conclusion
-This is the end of the first part about Linux kernel insides. If you have questions or suggestions, ping me on Twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com), or just create an [issue](https://github.com/0xAX/linux-internals/issues/new). In the next part, we will see the first C code that executes in the Linux kernel setup, the implementation of memory routines such as `memset`, `memcpy`, `earlyprintk`, early console implementation and initialization, and much more.
+This is the end of the first part about Linux kernel insides. If you have questions or suggestions, feel free to ping me on X - [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com), or just create an [issue](https://github.com/0xAX/linux-insides/issues/new). In the next part, we will see the first C code that executes in the Linux kernel setup, the implementation of memory routines such as `memset`, `memcpy`, `earlyprintk`, early console implementation and initialization, and much more.
-**Please note that English is not my first language and I am really sorry for any inconvenience. If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-internals).**
+## Links
-Links
---------------------------------------------------------------------------------
+Here is the list of the links that you may find useful during reading of this chapter:
- * [Intel 80386 programmer's reference manual 1986](http://css.csail.mit.edu/6.858/2014/readings/i386.pdf)
- * [Minimal Boot Loader for Intel® Architecture](https://www.cs.cmu.edu/~410/doc/minimal_boot.pdf)
- * [Minimal Boot Loader in Assembler with comments](https://github.com/Stefan20162016/linux-insides-code/blob/master/bootloader.asm)
- * [8086](https://en.wikipedia.org/wiki/Intel_8086)
- * [80386](https://en.wikipedia.org/wiki/Intel_80386)
- * [Reset vector](https://en.wikipedia.org/wiki/Reset_vector)
- * [Real mode](https://en.wikipedia.org/wiki/Real_mode)
- * [Linux kernel boot protocol](https://www.kernel.org/doc/Documentation/x86/boot.txt)
- * [coreboot developer manual](https://www.coreboot.org/Developer_Manual)
- * [Ralf Brown's Interrupt List](http://www.ctyme.com/intr/int.htm)
- * [Power supply](https://en.wikipedia.org/wiki/Power_supply)
- * [Power good signal](https://en.wikipedia.org/wiki/Power_good_signal)
+- [Intel 80386 programmer's reference manual 1986](http://css.csail.mit.edu/6.858/2014/readings/i386.pdf)
+- [Minimal Boot Loader for Intel® Architecture](https://www.cs.cmu.edu/~410/doc/minimal_boot.pdf)
+- [Minimal Boot Loader in Assembler with comments](https://github.com/Stefan20162016/linux-insides-code/blob/master/bootloader.asm)
+- [8086](https://en.wikipedia.org/wiki/Intel_8086)
+- [80386](https://en.wikipedia.org/wiki/Intel_80386)
+- [Reset vector](https://en.wikipedia.org/wiki/Reset_vector)
+- [Real mode](https://en.wikipedia.org/wiki/Real_mode)
+- [Linux kernel boot protocol](https://www.kernel.org/doc/Documentation/x86/boot.rst)
+- [Ralf Brown's Interrupt List](http://www.ctyme.com/intr/int.htm)
+- [Power supply](https://en.wikipedia.org/wiki/Power_supply)
+- [Power good signal](https://en.wikipedia.org/wiki/Power_good_signal)
diff --git a/Booting/linux-bootstrap-2.md b/Booting/linux-bootstrap-2.md
index 508d6d33..c3454d44 100644
--- a/Booting/linux-bootstrap-2.md
+++ b/Booting/linux-bootstrap-2.md
@@ -1,418 +1,363 @@
-Kernel booting process. Part 2.
-================================================================================
+# Kernel booting process - Part 2.
-First steps in the kernel setup
---------------------------------------------------------------------------------
+We have already started our journey into the Linux kernel in the previous [part](./linux-bootstrap-1.md), where we walked through the very early stages of the booting process and the first assembly instructions of the Linux kernel code. Among other things, this code was responsible for preparing the environment for the [C](https://en.wikipedia.org/wiki/C_(programming_language)) programming language. At the end of the chapter we reached a symbolic milestone - the very first call of a C function. This function has the classic name `main` and is defined in the [arch/x86/boot/main.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/main.c) source code file.
-We started to dive into the linux kernel's insides in the previous [part](linux-bootstrap-1.md) and saw the initial part of the kernel setup code. We stopped at the first call to the `main` function (which is the first function written in C) from [arch/x86/boot/main.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/main.c).
+From here on, we will see assembly code more and more rarely, but it is not the end 🤓 We will still meet some assembly code on our way, just less and less often. Now it is time for more "high level" logic!
-In this part, we will continue to research the kernel setup code and go over
-* what `protected mode` is,
-* the transition into it,
-* the initialization of the heap and the console,
-* memory detection, CPU validation and keyboard initialization
-* and much much more.
+In this part, we’ll keep digging through the kernel’s setup code and cover:
-So, let's go ahead.
+- What [protected mode](https://en.wikipedia.org/wiki/Protected_mode) is on x86 processors
+- Setup of early [heap](https://en.wikipedia.org/wiki/Memory_management#HEAP) and console
+- Detection of available memory
+- Validation of a CPU
+- Initialization of a keyboard
-Protected mode
---------------------------------------------------------------------------------
+Time to explore these steps in detail!
-Before we can move to the native Intel64 [Long Mode](http://en.wikipedia.org/wiki/Long_mode), the kernel must switch the CPU into protected mode.
+## Protected mode
-What is [protected mode](https://en.wikipedia.org/wiki/Protected_mode)? Protected mode was first added to the x86 architecture in 1982 and was the main mode of Intel processors from the [80286](http://en.wikipedia.org/wiki/Intel_80286) processor until Intel 64 and long mode came.
+The Linux kernel for x86_64 operates in a special mode called [long mode](http://en.wikipedia.org/wiki/Long_mode). One of the main goals of all the kernel setup code is to switch to this mode. But before we can move to this mode, the kernel must switch the CPU into [protected mode](https://en.wikipedia.org/wiki/Protected_mode).
-The main reason to move away from [Real mode](http://wiki.osdev.org/Real_Mode) is that there is very limited access to the RAM. As you may remember from the previous part, there are only 220 bytes or 1 Megabyte, sometimes even only 640 Kilobytes of RAM available in Real mode.
+What is [protected mode](https://en.wikipedia.org/wiki/Protected_mode)? From the previous chapter we already know that the CPU currently operates in [real mode](https://en.wikipedia.org/wiki/Real_mode). For us, this mostly means memory segmentation. As a short reminder - to access a memory location, a combination of two CPU [registers](https://en.wikipedia.org/wiki/Processor_register) is used:
-Protected mode brought many changes, but the main one is the difference in memory management. The 20-bit address bus was replaced with a 32-bit address bus. It allowed access to 4 Gigabytes of memory vs the 1 Megabyte in Real mode. Also, [paging](http://en.wikipedia.org/wiki/Paging) support was added, which you can read about in the next sections.
+- A segment register - `cs`, `ds`, `ss` and `es` which defines segment selector.
+- A general purpose register which specifies offset within the segment.
-Memory management in Protected mode is divided into two, almost independent parts:
+The main motivation for switching from real mode is its memory limitation. As we saw in the previous part, real mode can address only 2<sup>20</sup> bytes. This is just 1 MB of RAM. Obviously, modern software, including an operating system kernel, needs more. To break this constraint, a new processor mode was introduced - protected mode.
-* Segmentation
-* Paging
+Protected mode was introduced to the x86 architecture in 1982 and became the primary operating mode of Intel processors, starting with the [80286](http://en.wikipedia.org/wiki/Intel_80286) until the introduction of x86_64 and long mode. This mode brought many changes and improvements, but one of the most crucial was in memory management. The 20-bit address bus was replaced with a 32-bit address bus. It allowed access to 4 Gigabytes of memory vs the 1 Megabyte in real mode.
-Here we will only talk about segmentation. Paging will be discussed in the next sections.
+Memory management in protected mode is divided into two, mostly independent mechanisms:
-As you can read in the previous part, addresses consist of two parts in Real mode:
+- `Segmentation`
+- `Paging`
-* Base address of the segment
-* Offset from the segment base
+For now, our attention stays on segmentation. We’ll return to paging later, once we enter 64-bit long mode.
-And we can get the physical address if we know these two parts by:
+### Memory segmentation in protected mode
-```
-PhysicalAddress = Segment Base * 16 + Offset
-```
+In protected mode, memory segmentation was completely redesigned. Fixed 64 KB real mode segments are gone. Instead, each segment is now defined by a special data structure called a `Segment Descriptor` which specifies the properties of a memory segment. The segment descriptors are stored in another special structure called the `Global Descriptor Table` or `GDT`. Whenever the CPU needs to find an actual physical memory address, it consults this table. The GDT itself is just a block of memory whose address is stored in a special CPU register called `gdtr`. This is a 48-bit register and consists of two parts:
-Memory segmentation was completely redone in protected mode. There are no 64 Kilobyte fixed-size segments. Instead, the size and location of each segment is described by an associated data structure called the _Segment Descriptor_. These segment descriptors are stored in a data structure called the `Global Descriptor Table` (GDT).
+- The size of the Global Descriptor Table
+- The address of the Global Descriptor Table
-The GDT is a structure which resides in memory. It has no fixed place in the memory, so its address is stored in the special `GDTR` register. Later we will see how the GDT is loaded in the Linux kernel code. There will be an operation for loading it from memory, something like:
+Later, we will see exactly how the Linux kernel builds and loads its GDT. For now, it’s enough to know that the CPU provides a dedicated instruction to load the table’s address into the GDTR register:
```assembly
lgdt gdt
```
-where the `lgdt` instruction loads the base address and limit(size) of the global descriptor table to the `GDTR` register. `GDTR` is a 48-bit register and consists of two parts:
-
- * the size(16-bit) of the global descriptor table;
- * the address(32-bit) of the global descriptor table.
-
-As mentioned above, the GDT contains `segment descriptors` which describe memory segments. Each descriptor is 64-bits in size. The general scheme of a descriptor is:
-
-```
- 63 56 51 48 45 39 32
-------------------------------------------------------------
-| | |B| |A| | | | |0|E|W|A| |
-| BASE 31:24 |G|/|L|V| LIMIT |P|DPL|S| TYPE | BASE 23:16 |
-| | |D| |L| 19:16 | | | |1|C|R|A| |
-------------------------------------------------------------
-
- 31 16 15 0
-------------------------------------------------------------
-| | |
-| BASE 15:0 | LIMIT 15:0 |
-| | |
-------------------------------------------------------------
-```
-
-Don't worry, I know it looks a little scary after Real mode, but it's easy. For example LIMIT 15:0 means that bits 0-15 of the segment limit are located at the beginning of the Descriptor. The rest of it is in LIMIT 19:16, which is located at bits 48-51 of the Descriptor. So, the size of Limit is 0-19 i.e 20-bits. Let's take a closer look at it:
-
-1. Limit[20-bits] is split between bits 0-15 and 48-51. It defines the `length_of_segment - 1`. It depends on the `G`(Granularity) bit.
+As mentioned above, the GDT contains `segment descriptors` which describe memory segments. Now let's see what segment descriptors look like. Each descriptor is 64 bits in size. The general scheme of a descriptor is:
- * if `G` (bit 55) is 0 and the segment limit is 0, the size of the segment is 1 Byte
- * if `G` is 1 and the segment limit is 0, the size of the segment is 4096 Bytes
- * if `G` is 0 and the segment limit is 0xfffff, the size of the segment is 1 Megabyte
- * if `G` is 1 and the segment limit is 0xfffff, the size of the segment is 4 Gigabytes
+
- So, what this means is
- * if G is 0, Limit is interpreted in terms of 1 Byte and the maximum size of the segment can be 1 Megabyte.
- * if G is 1, Limit is interpreted in terms of 4096 Bytes = 4 KBytes = 1 Page and the maximum size of the segment can be 4 Gigabytes. Actually, when G is 1, the value of Limit is shifted to the left by 12 bits. So, 20 bits + 12 bits = 32 bits and 232 = 4 Gigabytes.
+Do not worry! I know it may look a little bit intimidating at first glance, especially in comparison to the relatively simple addressing in real mode, but we will go through it in detail. We will start from the bottom, from right to left.
-2. Base[32-bits] is split between bits 16-31, 32-39 and 56-63. It defines the physical address of the segment's starting location.
+The first field is `LIMIT 15:0`. It represents the first 16 bits of the segment limit. The second part is located at bits `51:48`. This field provides information about the size of a segment. With a 20-bit limit field, it may seem that the maximum size of a memory segment is 1 MB, but this is not the case. The maximum size of a segment also depends on the `G` bit (bit 55):
-3. Type/Attribute[5-bits] is represented by bits 40-44. It defines the type of segment and how it can be accessed.
- * The `S` flag at bit 44 specifies the descriptor type. If `S` is 0 then this segment is a system segment, whereas if `S` is 1 then this is a code or data segment (Stack segments are data segments which must be read/write segments).
+- If `G=0` - the value of the `LIMIT` field is interpreted in bytes.
+- If `G=1` - the value of the `LIMIT` field is interpreted in 4 KB units called pages.
-To determine if the segment is a code or data segment, we can check its Ex(bit 43) Attribute (marked as 0 in the above diagram). If it is 0, then the segment is a Data segment, otherwise, it is a code segment.
+Based on this, we can easily calculate that the max size of a segment is 4 GB.
-A segment can be of one of the following types:
+The next field is `BASE`. We can see that it is split into three parts. The first part occupies bits 16 to 31, the second part occupies bits 32 to 39, and the third part occupies bits 56 to 63. The main goal of this field is to store the base address of a segment.
-```
---------------------------------------------------------------------------------------
-| Type Field | Descriptor Type | Description |
-|-----------------------------|-----------------|------------------------------------|
-| Decimal | | |
-| 0 E W A | | |
-| 0 0 0 0 0 | Data | Read-Only |
-| 1 0 0 0 1 | Data | Read-Only, accessed |
-| 2 0 0 1 0 | Data | Read/Write |
-| 3 0 0 1 1 | Data | Read/Write, accessed |
-| 4 0 1 0 0 | Data | Read-Only, expand-down |
-| 5 0 1 0 1 | Data | Read-Only, expand-down, accessed |
-| 6 0 1 1 0 | Data | Read/Write, expand-down |
-| 7 0 1 1 1 | Data | Read/Write, expand-down, accessed |
-| C R A | | |
-| 8 1 0 0 0 | Code | Execute-Only |
-| 9 1 0 0 1 | Code | Execute-Only, accessed |
-| 10 1 0 1 0 | Code | Execute/Read |
-| 11 1 0 1 1 | Code | Execute/Read, accessed |
-| 12 1 1 0 0 | Code | Execute-Only, conforming |
-| 14 1 1 0 1 | Code | Execute-Only, conforming, accessed |
-| 13 1 1 1 0 | Code | Execute/Read, conforming |
-| 15 1 1 1 1 | Code | Execute/Read, conforming, accessed |
---------------------------------------------------------------------------------------
-```
-
-As we can see the first bit(bit 43) is `0` for a _data_ segment and `1` for a _code_ segment. The next three bits (40, 41, 42) are either `EWA`(*E*xpansion *W*ritable *A*ccessible) or CRA(*C*onforming *R*eadable *A*ccessible).
- * if E(bit 42) is 0, expand up, otherwise, expand down. Read more [here](http://www.sudleyplace.com/dpmione/expanddown.html).
- * if W(bit 41)(for Data Segments) is 1, write access is allowed, and if it is 0, the segment is read-only. Note that read access is always allowed on data segments.
- * A(bit 40) controls whether the segment can be accessed by the processor or not.
- * C(bit 43) is the conforming bit(for code selectors). If C is 1, the segment code can be executed from a lower level privilege (e.g. user) level. If C is 0, it can only be executed from the same privilege level.
- * R(bit 41) controls read access to code segments; when it is 1, the segment can be read from. Write access is never granted for code segments.
-
-4. DPL[2-bits] (Descriptor Privilege Level) comprises the bits 45-46. It defines the privilege level of the segment. It can be 0-3 where 0 is the most privileged level.
-
-5. The P flag(bit 47) indicates if the segment is present in memory or not. If P is 0, the segment will be presented as _invalid_ and the processor will refuse to read from this segment.
+The remaining fields in a segment descriptor are flags which control different aspects of a segment, such as the type of memory. Let's take a look at the description of these flags:
-6. AVL flag(bit 52) - Available and reserved bits. It is ignored in Linux.
+- `Type` - describes the type of a memory segment.
+- `S` - distinguishes system segments from code and data segments.
+- `DPL` - provides information about the privilege level of a segment. It can be a value from 0 to 3, where 0 is the most privileged level.
+- `P` - tells the CPU whether a segment is present in memory.
+- `AVL` - available and reserved bits. It is ignored by the Linux kernel.
+- `L` - indicates whether a code segment contains 64-bit code.
+- `D / B` - has a different meaning depending on the type of a segment.
+ - For a code segment: Controls the default operand and address size. If the bit is clear, it is a 16-bit code segment. Otherwise it is a 32-bit code segment.
+ - For a stack segment or in other words a data segment pointed by the `ss` register: Controls the default stack pointer size. If the bit is clear, it is a 16-bit stack segment and stack operations use `sp` register. Otherwise it is a 32-bit stack segment and stack operations use `esp` register.
+ - For an expand-down data segment: Specifies the upper bound of the segment. If the bit is clear, the upper bound is `0xFFFF` or 64 KB. Otherwise, it is `0xFFFFFFFF` or 4 GB.
-7. The L flag(bit 53) indicates whether a code segment contains native 64-bit code. If it is set, then the code segment executes in 64-bit mode.
+If the `S` flag of a segment descriptor is set, the descriptor describes either a code or a data segment; otherwise it is a system segment. If the highest-order bit of the `Type` field is clear, the descriptor describes a data segment; otherwise it describes a code segment. The remaining three bits of a data segment descriptor are interpreted as:
-8. The D/B flag(bit 54) (Default/Big flag) represents the operand size i.e 16/32 bits. If set, operand size is 32 bits. Otherwise, it is 16 bits.
-
-Segment registers contain segment selectors as in real mode. However, in protected mode, a segment selector is handled differently. Each Segment Descriptor has an associated Segment Selector which is a 16-bit structure:
-
-```
- 15 3 2 1 0
------------------------------
-| Index | TI | RPL |
------------------------------
-```
+- `Accessed` - indicates whether a segment has been accessed since the last time the kernel cleared this bit.
+- `Write-Enable` - determines whether a segment is writable or read-only.
+- `Expansion-Direction` - determines whether the segment expands down, toward lower addresses, or not.
-Where,
-* **Index** stores the index number of the descriptor in the GDT.
-* **TI**(Table Indicator) indicates where to search for the descriptor. If it is 0 then the descriptor is searched for in the Global Descriptor Table(GDT). Otherwise, it will be searched for in the Local Descriptor Table(LDT).
-* And **RPL** contains the Requester's Privilege Level.
+For a code segment, these three bits are interpreted as:
-Every segment register has a visible and a hidden part.
-* Visible - The Segment Selector is stored here.
-* Hidden - The Segment Descriptor (which contains the base, limit, attributes & flags) is stored here.
+- `Accessed` - indicates whether a segment has been accessed since the last time the kernel cleared this bit.
+- `Read-Enable` - determines whether a segment is execute-only or execute-read.
+- `Conforming` - determines how privilege level changes are handled when transferring execution to that segment.
-The following steps are needed to get a physical address in protected mode:
+In the tables below you can find full information about the possible states of the flags for code and data segments.
-* The segment selector must be loaded in one of the segment registers.
-* The CPU tries to find a segment descriptor at the offset `GDT address + Index` from the selector and then loads the descriptor into the *hidden* part of the segment register.
-* If paging is disabled, the linear address of the segment, or its physical address, is given by the formula: Base address (found in the descriptor obtained in the previous step) + Offset.
+A data segment `Type` field:
-Schematically it will look like this:
+| E (Expand-Down) | W (Writable) | A (Accessed) | Description |
+| --------------- | ------------ | ------------ | --------------------------------- |
+| 0 | 0 | 0 | Read-Only |
+| 0 | 0 | 1 | Read-Only, accessed |
+| 0 | 1 | 0 | Read/Write |
+| 0 | 1 | 1 | Read/Write, accessed |
+| 1 | 0 | 0 | Read-Only, expand-down |
+| 1 | 0 | 1 | Read-Only, expand-down, accessed |
+| 1 | 1 | 0 | Read/Write, expand-down |
+| 1 | 1 | 1 | Read/Write, expand-down, accessed |
-
+A code segment `Type` field:
-The algorithm for the transition from real mode into protected mode is:
+| C (Conforming) | R (Readable) | A (Accessed) | Description |
+| -------------- | ------------ | ------------ | ---------------------------------- |
+| 0 | 0 | 0 | Execute-Only |
+| 0 | 0 | 1 | Execute-Only, accessed |
+| 0 | 1 | 0 | Execute/Read |
+| 0 | 1 | 1 | Execute/Read, accessed |
+| 1 | 0 | 0 | Execute-Only, conforming |
+| 1 | 1 | 0 | Execute/Read, conforming |
+| 1 | 0 | 1 | Execute-Only, conforming, accessed |
+| 1 | 1 | 1 | Execute/Read, conforming, accessed |
-* Disable interrupts
-* Describe and load the GDT with the `lgdt` instruction
-* Set the PE (Protection Enable) bit in CR0 (Control Register 0)
-* Jump to protected mode code
+So far, we’ve looked at how a segment descriptor defines the properties of a memory segment — its base, limit, type, and different flags. But how does the CPU actually refer to one of these descriptors during execution? Just like in real mode - using segment registers. In protected mode they contain segment selectors. However, in protected mode, a segment selector is handled differently. Each segment descriptor has an associated segment selector which is a 16-bit structure:
-We will see the complete transition to protected mode in the linux kernel in the next part, but before we can move to protected mode, we need to do some more preparations.
+
-Let's look at [arch/x86/boot/main.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/main.c). We can see some routines there which perform keyboard initialization, heap initialization, etc... Let's take a look.
+The meaning of the fields is:
-Copying boot parameters into the "zeropage"
---------------------------------------------------------------------------------
+- `Index` - the entry number of the descriptor in the descriptor table.
+- `TI` - indicates where to search for the descriptor
+ - If the value of the bit is `0`, a descriptor will be searched in the Global Descriptor Table.
+ - If the value of this bit is `1`, a descriptor will be searched in the Local Descriptor Table.
+- `RPL` - the privilege level requested by the selector.
-We will start from the `main` routine in "main.c". The first function which is called in `main` is [`copy_boot_params(void)`](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/main.c). It copies the kernel setup header into the corresponding field of the `boot_params` structure which is defined in the [arch/x86/include/uapi/asm/bootparam.h](https://github.com/torvalds/linux/blob/v4.16/arch/x86/include/uapi/asm/bootparam.h) header file.
+When a program running in protected mode references memory, the CPU needs to calculate the proper physical address. The following steps are needed to get a physical address in protected mode:
-The `boot_params` structure contains the `struct setup_header hdr` field. This structure contains the same fields as defined in the [linux boot protocol](https://www.kernel.org/doc/Documentation/x86/boot.txt) and is filled by the boot loader and also at kernel compile/build time. `copy_boot_params` does two things:
+1. A segment selector is loaded into one of the segment registers.
+2. The CPU tries to find the associated segment descriptor in the Global Descriptor Table based on the `Index` value from the segment selector. If the descriptor is found, it is loaded into a special hidden part of this segment register.
+3. The physical address will be the base address from the segment descriptor plus offset from the instruction pointer or memory location referenced within an executed instruction.
-1. It copies `hdr` from [header.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/header.S#L280) to the `setup_header` field in `boot_params` structure.
+In the next part, we will see the transition into protected mode. But before the kernel can be switched to protected mode, we need to do some more preparations.
-2. It updates the pointer to the kernel command line if the kernel was loaded with the old command line protocol.
+Let's continue from the point where we have stopped in the previous chapter.
-Note that it copies `hdr` with the `memcpy` function, defined in the [copy.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/copy.S) source file. Let's have a look inside:
+## Back to the Kernel: Entering main.c
-```assembly
-GLOBAL(memcpy)
- pushw %si
- pushw %di
- movw %ax, %di
- movw %dx, %si
- pushw %cx
- shrw $2, %cx
- rep; movsl
- popw %cx
- andw $3, %cx
- rep; movsb
- popw %di
- popw %si
- retl
-ENDPROC(memcpy)
-```
+As we have already mentioned at the beginning of this chapter, one of the kernel's first main goals is to switch the processor into protected mode. But before this can happen, the kernel needs to do some preparations.
-Yeah, we just moved to C code and now assembly again :) First of all, we can see that `memcpy` and other routines which are defined here, start and end with the two macros: `GLOBAL` and `ENDPROC`. `GLOBAL` is described in [arch/x86/include/asm/linkage.h](https://github.com/torvalds/linux/blob/v4.16/arch/x86/include/asm/linkage.h) which defines the `globl` directive and its label. `ENDPROC` is described in [include/linux/linkage.h](https://github.com/torvalds/linux/blob/v4.16/include/linux/linkage.h) and marks the `name` symbol as a function name and ends with the size of the `name` symbol.
+If we look at the very beginning of the `main` function from the [arch/x86/boot/main.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/main.c), the very first thing we will see is a call of the `init_default_io_ops` function.
-The implementation of `memcpy` is simple. At first, it pushes values from the `si` and `di` registers to the stack to preserve their values because they will change during the `memcpy`. As we can see in the `REALMODE_CFLAGS` in `arch/x86/Makefile`, the kernel build system uses the `-mregparm=3` option of GCC, so functions get the first three parameters from `ax`, `dx` and `cx` registers. Calling `memcpy` looks like this:
+This function is defined in [arch/x86/boot/io.h](https://github.com/torvalds/linux/blob/master/arch/x86/boot/io.h) and looks like this:
-```c
-memcpy(&boot_params.hdr, &hdr, sizeof hdr);
+
+```C
+static inline void init_default_io_ops(void)
+{
+ pio_ops.f_inb = __inb;
+ pio_ops.f_outb = __outb;
+ pio_ops.f_outw = __outw;
+}
```
-So,
-* `ax` will contain the address of `boot_params.hdr`
-* `dx` will contain the address of `hdr`
-* `cx` will contain the size of `hdr` in bytes.
-
-`memcpy` puts the address of `boot_params.hdr` into `di` and saves `cx` on the stack. After this it shifts the value right 2 times (or divides it by 4) and copies four bytes from the address at `si` to the address at `di`. After this, we restore the size of `hdr` again, align it by 4 bytes and copy the rest of the bytes from the address at `si` to the address at `di` byte by byte (if there is more). Now the values of `si` and `di` are restored from the stack and the copying operation is finished.
-
-Console initialization
---------------------------------------------------------------------------------
-
-After `hdr` is copied into `boot_params.hdr`, the next step is to initialize the console by calling the `console_init` function, defined in [arch/x86/boot/early_serial_console.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/early_serial_console.c).
-
-It tries to find the `earlyprintk` option in the command line and if the search was successful, it parses the port address and baud rate of the serial port and initializes the serial port. The value of the `earlyprintk` command line option can be one of these:
+This function initializes function pointers for:
-* serial,0x3f8,115200
-* serial,ttyS0,115200
-* ttyS0,115200
+- reading a byte from an I/O port
+- writing a byte to an I/O port
+- writing a word (16-bit) to an I/O port
-After serial port initialization we can see the first output:
+These callbacks will be used to write data to the serial console, which will be initialized in one of the next steps. All the operations will be executed with the help of the `inb`, `outb`, and `outw` macros, which are defined in the same file:
+
```C
-if (cmdline_find_option_bool("debug"))
- puts("early console in setup code\n");
+#define inb pio_ops.f_inb
+#define outb pio_ops.f_outb
+#define outw pio_ops.f_outw
```
-The definition of `puts` is in [tty.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/tty.c). As we can see it prints character by character in a loop by calling the `putchar` function. Let's look into the `putchar` implementation:
+The `__inb`, `__outb`, and `__outw` themselves are inline functions from the [arch/x86/include/asm/shared/io.h](https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/shared/io.h):
+
```C
-void __attribute__((section(".inittext"))) putchar(int ch)
-{
- if (ch == '\n')
- putchar('\r');
-
- bios_putchar(ch);
-
- if (early_serial_base != 0)
- serial_putchar(ch);
+#define BUILDIO(bwl, bw, type) \
+static __always_inline void __out##bwl(type value, u16 port) \
+{ \
+ asm volatile("out" #bwl " %" #bw "0, %w1" \
+ : : "a"(value), "Nd"(port)); \
+} \
+ \
+static __always_inline type __in##bwl(u16 port) \
+{ \
+ type value; \
+ asm volatile("in" #bwl " %w1, %" #bw "0" \
+ : "=a"(value) : "Nd"(port)); \
+ return value; \
}
+
+BUILDIO(b, b, u8)
+BUILDIO(w, w, u16)
+BUILDIO(l, , u32)
```
-`__attribute__((section(".inittext")))` means that this code will be in the `.inittext` section. We can find it in the linker file [setup.ld](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/setup.ld).
+All of these functions use `in` and `out` assembly instructions which send the given value to the given port or read the value from the given port. If the syntax is not familiar to you, you can read the chapter about [inline assembly](https://github.com/0xAX/linux-insides/blob/master/Theory/linux-theory-3.md).
-First of all, `putchar` checks for the `\n` symbol and if it is found, prints `\r` before. After that it prints the character on the VGA screen by calling the BIOS with the `0x10` interrupt call:
+After the callbacks for writing to a serial port are initialized, the next step is to copy the kernel setup header filled in by the bootloader into the corresponding field of the C `boot_params` structure. This makes the fields of the kernel setup header more easily accessible. All the copying is handled by the `copy_boot_params` function with the help of `memcpy`:
+
```C
-static void __attribute__((section(".inittext"))) bios_putchar(int ch)
-{
- struct biosregs ireg;
-
- initregs(&ireg);
- ireg.bx = 0x0007;
- ireg.cx = 0x0001;
- ireg.ah = 0x0e;
- ireg.al = ch;
- intcall(0x10, &ireg, NULL);
-}
+ memcpy(&boot_params.hdr, &hdr, sizeof(hdr));
```
-Here `initregs` takes the `biosregs` structure and first fills `biosregs` with zeros using the `memset` function and then fills it with register values.
+Do not confuse this `memcpy` with the function from the C standard library - [memcpy](https://man7.org/linux/man-pages/man3/memcpy.3.html). While the kernel is in the early initialization phase, there is no way to load any library. For this reason, an operating system kernel provides its own implementation of such functions. The kernel's `memcpy` is defined in [copy.S](https://github.com/torvalds/linux/blob/master/arch/x86/boot/copy.S). If you have already started to miss assembly code, it is high time to bring some back:
-```C
- memset(reg, 0, sizeof *reg);
- reg->eflags |= X86_EFLAGS_CF;
- reg->ds = ds();
- reg->es = ds();
- reg->fs = fs();
- reg->gs = gs();
+
+```assembly
+SYM_FUNC_START_NOALIGN(memcpy)
+ pushw %si
+ pushw %di
+ movw %ax, %di
+ movw %dx, %si
+ pushw %cx
+ shrw $2, %cx
+ rep movsl
+ popw %cx
+ andw $3, %cx
+ rep movsb
+ popw %di
+ popw %si
+ retl
+SYM_FUNC_END(memcpy)
```
-Let's look at the implementation of [memset](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/copy.S#L36):
+First of all, we can see that `memcpy` and other routines which are defined there, start and end with the two macros - `SYM_FUNC_START_NOALIGN` and `SYM_FUNC_END`. The `SYM_FUNC_START_NOALIGN` just specifies the given symbol name as [.globl](https://sourceware.org/binutils/docs/as.html#Global) to make it visible for other functions. The `SYM_FUNC_END` just expands to an empty string in our case.
-```assembly
-GLOBAL(memset)
- pushw %di
- movw %ax, %di
- movzbl %dl, %eax
- imull $0x01010101,%eax
- pushw %cx
- shrw $2, %cx
- rep; stosl
- popw %cx
- andw $3, %cx
- rep; stosb
- popw %di
- retl
-ENDPROC(memset)
-```
+Although this function is written in assembly language, the implementation of `memcpy` is relatively simple. First, it pushes the values of the `si` and `di` registers onto the stack to preserve them, because they will change during the `memcpy` execution. Next comes the handling of the function's parameters. The parameters of this function are passed through the `ax`, `dx`, and `cx` registers. This is because the kernel setup code is built with the `-mregparm=3` option. So:
-As you can read above, it uses the same calling conventions as the `memcpy` function, which means that the function gets its parameters from the `ax`, `dx` and `cx` registers.
+- `ax` will contain the address of `boot_params.hdr`
+- `dx` will contain the address of `hdr`
+- `cx` will contain the size of `hdr` in bytes
-The implementation of `memset` is similar to that of memcpy. It saves the value of the `di` register on the stack and puts the value of`ax`, which stores the address of the `biosregs` structure, into `di` . Next is the `movzbl` instruction, which copies the value of `dl` to the lowermost byte of the `eax` register. The remaining 3 high bytes of `eax` will be filled with zeros.
+The `rep movsl` instruction copies data from the memory location pointed to by the `si` register to the memory location pointed to by the `di` register. Each iteration copies 4 bytes. For this reason, the size of the setup header is first divided by 4 using the `shrw` instruction. After this step, `rep movsb` copies the remaining bytes (the size modulo 4, computed by `andw $3, %cx`) one byte at a time.
-The next instruction multiplies `eax` with `0x01010101`. It needs to because `memset` will copy 4 bytes at the same time. For example, if we need to fill a structure whose size is 4 bytes with the value `0x7` with memset, `eax` will contain the `0x00000007`. So if we multiply `eax` with `0x01010101`, we will get `0x07070707` and now we can copy these 4 bytes into the structure. `memset` uses the `rep; stosl` instruction to copy `eax` into `es:di`.
+From this point, the setup header is copied into a proper place and we can move forward.
-The rest of the `memset` function does almost the same thing as `memcpy`.
+### Console initialization
-After the `biosregs` structure is filled with `memset`, `bios_putchar` calls the [0x10](http://www.ctyme.com/intr/rb-0106.htm) interrupt which prints a character. Afterwards it checks if the serial port was initialized or not and writes a character there with [serial_putchar](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/tty.c) and `inb/outb` instructions if it was set.
+As soon as the kernel setup header is copied into the `boot_params.hdr`, the next step is to initialize the serial console by calling the `console_init` function. Very soon we will be able to print something from within the kernel code!
-Heap initialization
---------------------------------------------------------------------------------
+The `console_init` function is defined in [arch/x86/boot/early_serial_console.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/early_serial_console.c). As the very first step, it tries to find the `earlyprintk` option in the kernel's command line. If the search is successful, it parses the port address and [baud rate](https://en.wikipedia.org/wiki/Baud) and initializes the serial port.
-After the stack and bss section have been prepared in [header.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/header.S) (see previous [part](linux-bootstrap-1.md)), the kernel needs to initialize the [heap](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/main.c) with the [`init_heap`](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/main.c) function.
+> [!NOTE]
+> If you want to know what other options you can pass in the kernel command line, you can find more information in the document [The kernel's command-line parameters](https://github.com/torvalds/linux/blob/master/Documentation/admin-guide/kernel-parameters.rst).
-First of all `init_heap` checks the [`CAN_USE_HEAP`](https://github.com/torvalds/linux/blob/v4.16/arch/x86/include/uapi/asm/bootparam.h#L24) flag from the [`loadflags`](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/header.S#L320) structure in the kernel setup header and calculates the end of the stack if this flag was set:
+Let's take a look at these two steps in detail.
-```C
- char *stack_end;
+The possible values of the `earlyprintk` command line option are:
- if (boot_params.hdr.loadflags & CAN_USE_HEAP) {
- asm("leal %P1(%%esp),%0"
- : "=r" (stack_end) : "i" (-STACK_SIZE));
-```
+- `serial,0x3f8,115200`
+- `serial,ttyS0,115200`
+- `ttyS0,115200`
-or in other words `stack_end = esp - STACK_SIZE`.
+These parameters define the name of a serial port, the port number, and the baud rate. The pointer to the kernel command line is stored in the kernel setup header and can be accessed through `boot_params.hdr.cmd_line_ptr`. The `parse_earlyprintk` function tries to find the `earlyprintk` option in the kernel command line, parses it if it was found, and initializes the serial console parameters with one of the values above. If the `earlyprintk` option is given and contains valid values, the initialization of the serial console takes place in the `early_serial_init` function. There is nothing specific to the Linux kernel in the initialization of a serial console, so we will skip this part. If you want to dive deeper by yourself, you can find more information [here](https://wiki.osdev.org/Serial_Ports#Port_Addresses) and study [arch/x86/boot/early_serial_console.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/early_serial_console.c) step by step.
-Then there is the `heap_end` calculation:
+After the serial port initialization we can see the first output:
+
```C
- heap_end = (char *)((size_t)boot_params.hdr.heap_end_ptr + 0x200);
+ if (cmdline_find_option_bool("debug"))
+ puts("early console in setup code\n");
```
-which means `heap_end_ptr` or `_end` + `512` (`0x200h`). The last check is whether `heap_end` is greater than `stack_end`. If it is then `stack_end` is assigned to `heap_end` to make them equal.
+The `puts` function uses the `inb` function that we have seen above during initialization of I/O callbacks.
-Now the heap is initialized and we can use it using the `GET_HEAP` method. We will see what it is used for, how to use it and how it is implemented in the next posts.
+From this point we can print messages from the kernel setup code 🎉. Time to move to the next step.
-CPU validation
---------------------------------------------------------------------------------
+### Heap initialization
-The next step as we can see is cpu validation through the `validate_cpu` function from [arch/x86/boot/cpu.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/cpu.c) source code file.
-
-It calls the [`check_cpu`](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/cpucheck.c) function and passes cpu level and required cpu level to it and checks that the kernel launches on the right cpu level.
+We have seen the initialization of the `stack` and `bss` memory areas in the previous chapter. The next step is to initialize the [heap](https://en.wikipedia.org/wiki/Memory_management#HEAP) memory area. The heap initialization takes place in the `init_heap` function:
+
```C
-check_cpu(&cpu_level, &req_level, &err_flags);
-if (cpu_level < req_level) {
- ...
- return -1;
+static void init_heap(void)
+{
+ char *stack_end;
+
+ if (boot_params.hdr.loadflags & CAN_USE_HEAP) {
+ stack_end = (char *) (current_stack_pointer - STACK_SIZE);
+ heap_end = (char *) ((size_t)boot_params.hdr.heap_end_ptr + 0x200);
+ if (heap_end > stack_end)
+ heap_end = stack_end;
+ } else {
+ /* Boot protocol 2.00 only, no heap available */
+ puts("WARNING: Ancient bootloader, some functionality may be limited!\n");
+ }
}
```
-The `check_cpu` function checks the CPU's flags, the presence of [long mode](http://en.wikipedia.org/wiki/Long_mode) in the case of x86_64(64-bit) CPU, checks the processor's vendor and makes preparations for certain vendors like turning off SSE+SSE2 for AMD if they are missing, etc.
+First of all, `init_heap` checks the `CAN_USE_HEAP` flag from the kernel setup header. If it is not set, we'll see the warning message. If heap is enabled, the last address of it is set to the `boot_params.hdr.heap_end_ptr` filled by bootloader plus 512 bytes or to the end of the stack if the value specified by bootloader is above it. The beginning of the heap is right after the end of the `.bss` area. The stack size is 1024 bytes. Thereby, the memory map will look like:
+
+
+
+Now the heap is initialized, although we will see the usage of it in the next chapters.
+
+### CPU validation
+
+The next step is the validation of the CPU on which the kernel is running. The kernel has to do this to make sure that all the required functionality will work correctly on the given CPU.
-at the next step, we may see a call to the `set_bios_mode` function after setup code found that a CPU is suitable. As we may see, this function is implemented only for the `x86_64` mode:
+The `validate_cpu` function from [arch/x86/boot/cpu.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/cpu.c) validates the CPU. This function calls [`check_cpu`](https://github.com/torvalds/linux/blob/master/arch/x86/boot/cpucheck.c), which checks the CPU model and its flags using the [cpuid](https://en.wikipedia.org/wiki/CPUID) instruction. It checks CPU flags such as the presence of [long mode](http://en.wikipedia.org/wiki/Long_mode), checks the processor's vendor, and makes preparations for certain vendors, such as turning on extensions like [SSE+SSE2](https://en.wikipedia.org/wiki/Single_instruction,_multiple_data):
+
```C
-static void set_bios_mode(void)
+int validate_cpu(void)
{
-#ifdef CONFIG_X86_64
- struct biosregs ireg;
-
- initregs(&ireg);
- ireg.ax = 0xec00;
- ireg.bx = 2;
- intcall(0x15, &ireg, NULL);
-#endif
-}
+ u32 *err_flags;
+ int cpu_level, req_level;
+
+ check_cpu(&cpu_level, &req_level, &err_flags);
+
+ if (cpu_level < req_level) {
+ printf("This kernel requires an %s CPU, ",
+ cpu_name(req_level));
+ printf("but only detected an %s CPU.\n",
+ cpu_name(cpu_level));
+ return -1;
+ }
```
-The `set_bios_mode` function executes the `0x15` BIOS interrupt to tell the BIOS that [long mode](https://en.wikipedia.org/wiki/Long_mode) (if `bx == 2`) will be used.
+If the level of CPU is less than the required level specified by the `CONFIG_X86_MINIMUM_CPU_FAMILY` kernel configuration option, the function returns the error and the kernel setup process is aborted.
-Memory detection
---------------------------------------------------------------------------------
+### Memory detection
-The next step is memory detection through the `detect_memory` function. `detect_memory` basically provides a map of available RAM to the CPU. It uses different programming interfaces for memory detection like `0xe820`, `0xe801` and `0x88`. We will see only the implementation of the **0xE820** interface here.
+Once the kernel is sure that the CPU it is running on is suitable, the next stage is to detect the available memory in the system. This task is handled by the `detect_memory` function, which queries the system firmware to obtain a map of physical memory regions. To do this, the kernel uses a special BIOS service - `0xE820`, but the kernel can fall back to legacy BIOS services like `0xE801` or `0x88`. In this chapter, we will see only the implementation of the `0xE820` interface.
-Let's look at the implementation of the `detect_memory_e820` function from the [arch/x86/boot/memory.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/memory.c) source file. First of all, the `detect_memory_e820` function initializes the `biosregs` structure as we saw above and fills registers with special values for the `0xe820` call:
+The `detect_memory` function is defined in [arch/x86/boot/memory.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/memory.c) and, as just mentioned, tries to get information about the available memory:
-```assembly
- initregs(&ireg);
- ireg.ax = 0xe820;
- ireg.cx = sizeof buf;
- ireg.edx = SMAP;
- ireg.di = (size_t)&buf;
-```
+
+```C
+void detect_memory(void)
+{
+ detect_memory_e820();
+
+ detect_memory_e801();
-* `ax` contains the number of the function (0xe820 in our case)
-* `cx` contains the size of the buffer which will contain data about the memory
-* `edx` must contain the `SMAP` magic number
-* `es:di` must contain the address of the buffer which will contain memory data
-* `ebx` has to be zero.
+ detect_memory_88();
+}
+```
-Next is a loop where data about the memory will be collected. It starts with a call to the `0x15` BIOS interrupt, which writes one line from the address allocation table. For getting the next line we need to call this interrupt again (which we do in the loop). Before the next call `ebx` must contain the value returned previously:
+Let's look at the crucial part of the implementation of the `detect_memory_e820` function. First of all, the `detect_memory_e820` function initializes the `biosregs` structure with the special values related to the `0xE820` BIOS interface:
+
```C
- intcall(0x15, &ireg, &oreg);
- ireg.ebx = oreg.ebx;
+ initregs(&ireg);
+ ireg.ax = 0xe820;
+ ireg.cx = sizeof(buf);
+ ireg.edx = SMAP;
+ ireg.di = (size_t)&buf;
```
-Ultimately, this function collects data from the address allocation table and writes this data into the `e820_entry` array:
+- `ax` register contains the number of the BIOS service
+- `cx` register contains the size of the buffer which will contain the data about available memory
+- `di` register contains the address of the buffer which will contain memory data
+- `edx` register contains the `SMAP` magic number
-* start of memory segment
-* size of memory segment
-* type of memory segment (whether the particular segment is usable or reserved)
+After the registers are filled with the needed values, the kernel can ask the `0xE820` BIOS interface about available memory. The kernel does this by invoking the `0x15` [BIOS interrupt](https://en.wikipedia.org/wiki/BIOS_interrupt_call), which returns information about one memory region. The kernel repeats this operation in a loop until information about all the memory regions is collected.
-You can see the result of this in the `dmesg` output, something like:
+After the information is collected, the kernel prints a message about the available memory regions. You can find it in the [dmesg](https://en.wikipedia.org/wiki/Dmesg) output:
```
[ 0.000000] e820: BIOS-provided physical RAM map:
@@ -424,87 +369,79 @@ You can see the result of this in the `dmesg` output, something like:
[ 0.000000] BIOS-e820: [mem 0x00000000fffc0000-0x00000000ffffffff] reserved
```
-Keyboard initialization
---------------------------------------------------------------------------------
+### Keyboard initialization
-The next step is the initialization of the keyboard with a call to the [`keyboard_init`](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/main.c) function. At first `keyboard_init` initializes registers using the `initregs` function. It then calls the [0x16](http://www.ctyme.com/intr/rb-1756.htm) interrupt to query the status of the keyboard.
+Once memory detection is complete, the kernel proceeds with initializing the keyboard using the `keyboard_init`:
-```c
- initregs(&ireg);
- ireg.ah = 0x02; /* Get keyboard status */
- intcall(0x16, &ireg, &oreg);
- boot_params.kbd_status = oreg.al;
-```
+
+```C
+static void keyboard_init(void)
+{
+ struct biosregs ireg, oreg;
+
+ initregs(&ireg);
-After this it calls [0x16](http://www.ctyme.com/intr/rb-1757.htm) again to set the repeat rate and delay.
+ ireg.ah = 0x02; /* Get keyboard status */
+ intcall(0x16, &ireg, &oreg);
+ boot_params.kbd_status = oreg.al;
-```c
- ireg.ax = 0x0305; /* Set keyboard repeat rate */
- intcall(0x16, &ireg, NULL);
+ ireg.ax = 0x0305; /* Set keyboard repeat rate */
+ intcall(0x16, &ireg, NULL);
+}
```
-Querying
---------------------------------------------------------------------------------
+This function performs two tasks using [BIOS interrupt](https://en.wikipedia.org/wiki/BIOS_interrupt_call) `0x16`:
-The next couple of steps are queries for different parameters. We will not dive into details about these queries but we will get back to them in later parts. Let's take a short look at these functions:
+1. Gets the state of the keyboard, which contains information about the state of certain modifier keys - for example, whether Caps Lock is active or not.
+2. Sets the keyboard repeat rate, which determines how long a key must be held down before it begins repeating.
-The first step is getting [Intel SpeedStep](http://en.wikipedia.org/wiki/SpeedStep) information by calling the `query_ist` function. It checks the CPU level and if it is correct, calls `0x15` to get the info and saves the result to `boot_params`.
+### Gathering system information
-Next, the [query_apm_bios](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/apm.c#L21) function gets [Advanced Power Management](http://en.wikipedia.org/wiki/Advanced_Power_Management) information from the BIOS. `query_apm_bios` calls the `0x15` BIOS interruption too, but with `ah` = `0x53` to check `APM` installation. After `0x15` finishes executing, the `query_apm_bios` functions check the `PM` signature (it must be `0x504d`), the carry flag (it must be 0 if `APM` supported) and the value of the `cx` register (if it's 0x02, the protected mode interface is supported).
+After we went through the most essential hardware interfaces like CPU, I/O, memory map, and keyboard, the next couple of steps are to query the BIOS for additional information about the system. The information which the kernel is going to gather is not strictly required for entering protected mode, but it provides useful details that later parts of the kernel may rely on.
-Next, it calls `0x15` again, but with `ax = 0x5304` to disconnect the `APM` interface and connect the 32-bit protected mode interface. In the end, it fills `boot_params.apm_bios_info` with values obtained from the BIOS.
+The following information is going to be collected:
-Note that `query_apm_bios` will be executed only if the `CONFIG_APM` or `CONFIG_APM_MODULE` compile time flag was set in the configuration file:
+- Information about [Intel SpeedStep](http://en.wikipedia.org/wiki/SpeedStep)
+- Information about [Advanced Power Management](http://en.wikipedia.org/wiki/Advanced_Power_Management)
+- Information about [Enhanced Disk Drive](https://en.wikipedia.org/wiki/INT_13H)
+At this moment we will not dive into the details of each of these queries, but we will get back to them in the next parts when we use this information. For now, let's just take a short look at these functions:
+
+
```C
+ /* Query Intel SpeedStep (IST) information */
+ query_ist();
+
+ /* Query APM information */
#if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE)
- query_apm_bios();
+ query_apm_bios();
#endif
-```
-
-The last is the [`query_edd`](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/edd.c#L122) function, which queries `Enhanced Disk Drive` information from the BIOS. Let's look at how `query_edd` is implemented.
-
-First of all, it reads the [edd](https://github.com/torvalds/linux/blob/v4.16/Documentation/admin-guide/kernel-parameters.rst) option from the kernel's command line and if it was set to `off` then `query_edd` just returns.
-If EDD is enabled, `query_edd` goes over BIOS-supported hard disks and queries EDD information in the following loop:
-
-```C
-for (devno = 0x80; devno < 0x80+EDD_MBR_SIG_MAX; devno++) {
- if (!get_edd_info(devno, &ei) && boot_params.eddbuf_entries < EDDMAXNR) {
- memcpy(edp, &ei, sizeof ei);
- edp++;
- boot_params.eddbuf_entries++;
- }
- ...
- ...
- ...
- }
+ /* Query EDD information */
+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
+ query_edd();
+#endif
```
-where `0x80` is the first hard drive and the value of the `EDD_MBR_SIG_MAX` macro is 16. It collects data into an array of [edd_info](https://github.com/torvalds/linux/blob/v4.16/include/uapi/linux/edd.h) structures. `get_edd_info` checks that EDD is present by invoking the `0x13` interrupt with `ah` as `0x41` and if EDD is present, `get_edd_info` again calls the `0x13` interrupt, but with `ah` as `0x48` and `si` containing the address of the buffer where EDD information will be stored.
+The first one is getting information about [Intel SpeedStep](http://en.wikipedia.org/wiki/SpeedStep). This information is obtained by calling the `0x15` BIOS interrupt, and the result is stored in the `boot_params` structure. The returned information describes the support of Intel SpeedStep and the settings around it. If it is supported, this information will be passed later by the kernel to the power management subsystems.
+
+The next one is getting information about [Advanced Power Management](http://en.wikipedia.org/wiki/Advanced_Power_Management). The logic of this function is pretty similar to the one described above. It uses the same `0x15` BIOS interrupt to obtain information and stores it in the `boot_params` structure. The returned information describes the support of `APM`, which was the power management subsystem before [ACPI](https://en.wikipedia.org/wiki/ACPI) became the standard.
-Conclusion
---------------------------------------------------------------------------------
+The last function gets information about the `Enhanced Disk Drive` from the BIOS. The `0x13` BIOS interrupt is used to obtain this information. The returned information describes the disks and their characteristics like geometry and mapping information.
-This is the end of the second part about the insides of the Linux kernel. In the next part, we will see video mode setting and the rest of the preparations before the transition to protected mode and directly transitioning into it.
+## Conclusion
-If you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).
+This is the end of the second part about Linux kernel insides. If you have questions or suggestions, feel free to ping me on X - [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com), or just create an [issue](https://github.com/0xAX/linux-insides/issues/new). In the next part, we will continue to deal with the preparations before transitioning into protected mode and the transition itself.
-**Please note that English is not my first language, And I am really sorry for any inconvenience. If you find any mistakes please send me a PR to [linux-insides](https://github.com/0xAX/linux-internals).**
+## Links
-Links
---------------------------------------------------------------------------------
+Here is the list of the links that you may find useful during reading of this chapter:
-* [Protected mode](http://en.wikipedia.org/wiki/Protected_mode)
-* [Protected mode](http://wiki.osdev.org/Protected_Mode)
-* [Long mode](http://en.wikipedia.org/wiki/Long_mode)
-* [Nice explanation of CPU Modes with code](http://www.codeproject.com/Articles/45788/The-Real-Protected-Long-mode-assembly-tutorial-for)
-* [How to Use Expand Down Segments on Intel 386 and Later CPUs](http://www.sudleyplace.com/dpmione/expanddown.html)
-* [earlyprintk documentation](https://github.com/torvalds/linux/blob/v4.16/Documentation/x86/earlyprintk.txt)
-* [Kernel Parameters](https://github.com/torvalds/linux/blob/v4.16/Documentation/admin-guide/kernel-parameters.rst)
-* [Serial console](https://github.com/torvalds/linux/blob/v4.16/Documentation/admin-guide/serial-console.rst)
-* [Intel SpeedStep](http://en.wikipedia.org/wiki/SpeedStep)
-* [APM](https://en.wikipedia.org/wiki/Advanced_Power_Management)
-* [EDD specification](http://www.t13.org/documents/UploadedDocuments/docs2004/d1572r3-EDD3.pdf)
-* [TLDP documentation for Linux Boot Process](http://www.tldp.org/HOWTO/Linux-i386-Boot-Code-HOWTO/setup.html) (old)
-* [Previous Part](linux-bootstrap-1.md)
+- [Protected mode](http://en.wikipedia.org/wiki/Protected_mode)
+- [Long mode](http://en.wikipedia.org/wiki/Long_mode)
+- [The kernel's command-line parameters](https://github.com/torvalds/linux/blob/master/Documentation/admin-guide/kernel-parameters.rst)
+- [Linux serial console](https://github.com/torvalds/linux/blob/master/Documentation/admin-guide/serial-console.rst)
+- [BIOS interrupt](https://en.wikipedia.org/wiki/BIOS_interrupt_call)
+- [Intel SpeedStep](http://en.wikipedia.org/wiki/SpeedStep)
+- [APM](https://en.wikipedia.org/wiki/Advanced_Power_Management)
+- [EDD specification](http://www.t13.org/documents/UploadedDocuments/docs2004/d1572r3-EDD3.pdf)
diff --git a/Booting/linux-bootstrap-3.md b/Booting/linux-bootstrap-3.md
index 46aa3bca..28b07262 100644
--- a/Booting/linux-bootstrap-3.md
+++ b/Booting/linux-bootstrap-3.md
@@ -1,26 +1,69 @@
-Kernel booting process. Part 3.
-================================================================================
+# Kernel booting process. Part 3.
-Video mode initialization and transition to protected mode
---------------------------------------------------------------------------------
+In the previous [part](./linux-bootstrap-2.md), we saw the first pieces of C code that run in the Linux kernel. One of the main goals of this stage is to switch into [protected mode](https://en.wikipedia.org/wiki/Protected_mode), but before this, we saw some early setup code which executes early initialization procedures, such as:
-This is the third part of the `Kernel booting process` series. In the previous [part](linux-bootstrap-2.md#kernel-booting-process-part-2), we stopped right before the call to the `set_video` routine from [main.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/main.c).
+- Setup of console to be able to print messages from the kernel's setup code
+- Validation of CPU
+- Detection of available memory
+- Initialization of keyboard
-In this part, we will look at:
+In this part, we will continue to explore the next steps before we see the transition into protected mode.
-* Video mode initialization in the kernel setup code,
-* the preparations made before switching into protected mode,
-* the transition to protected mode
+## Video mode setup
-**NOTE** If you don't know anything about protected mode, you can find some information about it in the previous [part](linux-bootstrap-2.md#protected-mode). Also, there are a couple of [links](linux-bootstrap-2.md#links) which can help you.
+Previously, we stopped right at the point where the kernel setup code was about to initialize the video mode.
-As I wrote above, we will start from the `set_video` function which is defined in the [arch/x86/boot/video.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/video.c) source code file. We can see that it starts by first getting the video mode from the `boot_params.hdr` structure:
+The setup code is located in the [arch/x86/boot/video.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/video.c) and implemented by the `set_video` function. Now let's take a look at the implementation of the `set_video` function:
+
+
+```C
+void set_video(void)
+{
+ u16 mode = boot_params.hdr.vid_mode;
+
+ RESET_HEAP();
+
+ store_mode_params();
+ save_screen();
+ probe_cards(0);
+
+ for (;;) {
+ if (mode == ASK_VGA)
+ mode = mode_menu();
+
+ if (!set_mode(mode))
+ break;
+
+ printf("Undefined video mode number: %x\n", mode);
+ mode = ASK_VGA;
+ }
+ boot_params.hdr.vid_mode = mode;
+ vesa_store_edid();
+ store_mode_params();
+
+ if (do_restore)
+ restore_screen();
+}
+```
+
+Let's try to understand what this function does in the next sections.
+
+### Video modes
+
+The implementation of the `set_video` function starts by getting the video mode from the `boot_params.hdr` structure:
```C
u16 mode = boot_params.hdr.vid_mode;
```
-which we filled in the `copy_boot_params` function (you can read about it in the previous post). `vid_mode` is an obligatory field which is filled by the bootloader. You can find information about it in the kernel `boot protocol`:
+> [!NOTE]
+> Instead of good old standard C data types like `int`, `short`, `unsigned short`, the Linux kernel provides its own data types for numeric values. Here is a table that will help you to remember them:
+>
+> | Type | char | short | int | long | u8 | u16 | u32 | u64 |
+> |------|------|-------|-----|------|----|-----|-----|-----|
+> | Size | 1 | 2 | 4 | 8 | 1 | 2 | 4 | 8 |
+
+The initial value of the video mode can be filled by the bootloader. This header field is defined in the Linux kernel boot protocol:
```
Offset Proto Name Meaning
@@ -28,7 +71,7 @@ Offset Proto Name Meaning
01FA/2 ALL vid_mode Video mode control
```
-As we can read from the linux kernel boot protocol:
+Information about potential values for this field can be also found in the Linux kernel boot protocol document:
```
vga=
@@ -40,60 +83,86 @@ vga=
line is parsed.
```
-So we can add the `vga` option to the grub (or another bootloader's) configuration file and it will pass this option to the kernel command line. This option can have different values as mentioned in the description. For example, it can be an integer number `0xFFFD` or `ask`. If you pass `ask` to `vga`, you will see a menu like this:
-
-
-
-which will ask to select a video mode. We will look at its implementation, but before diving into the implementation we have to look at some other things.
-
-Kernel data types
---------------------------------------------------------------------------------
-
-Earlier we saw definitions of different data types like `u16` etc. in the kernel setup code. Let's look at a couple of data types provided by the kernel:
-
-
-| Type | char | short | int | long | u8 | u16 | u32 | u64 |
-|------|------|-------|-----|------|----|-----|-----|-----|
-| Size | 1 | 2 | 4 | 8 | 1 | 2 | 4 | 8 |
-
-If you read the source code of the kernel, you'll see these very often and so it will be good to remember them.
-
-Heap API
---------------------------------------------------------------------------------
-
-After we get `vid_mode` from `boot_params.hdr` in the `set_video` function, we can see the call to the `RESET_HEAP` function. `RESET_HEAP` is a macro which is defined in [arch/x86/boot/boot.h](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/boot.h) header file.
-
-This macro is defined as:
-
+This tells us that we can add the `vga` option to the GRUB (or another bootloader's) configuration file and it will pass this option to the kernel command line. This option can have different values as mentioned in the description above. For example, it can be an integer number `0xFFFD` or `ask`. If you pass `ask` to `vga`, you will see a menu with the possible video modes. We can test it using the [QEMU](https://www.qemu.org/) virtual machine:
+
+```bash
+sudo qemu-system-x86_64 -kernel ./linux/arch/x86/boot/bzImage \
+ -nographic \
+ -append "console=ttyS0 nokaslr vga=ask" \
+ -initrd /boot/initramfs-6.17.0-rc3-g1b237f190eb3.img
+```
+
+If you did everything correctly, after the kernel is loaded it will ask you to press `ENTER`. After pressing it, you should see something like this:
+
+```
+Booting from ROM...
+Probing EDD (edd=off to disable)... ok
+Press <ENTER> to see video modes available, <SPACE> to continue, or wait 30 sec
+Mode: Resolution: Type: Mode: Resolution: Type: Mode: Resolution: Type:
+0 F00 80x25 VGA 1 F01 80x50 VGA 2 F02 80x43 VGA
+3 F03 80x28 VGA 4 F05 80x30 VGA 5 F06 80x34 VGA
+6 F07 80x60 VGA 7 340 320x200x32 VESA 8 341 640x400x32 VESA
+9 342 640x480x32 VESA a 343 800x600x32 VESA b 344 1024x768x32 VESA
+c 345 1280x1024x32 VESA d 347 1600x1200x32 VESA e 34C 1152x864x32 VESA
+f 377 1280x768x32 VESA g 37A 1280x800x32 VESA h 37D 1280x960x32 VESA
+i 380 1440x900x32 VESA j 383 1400x1050x32 VESA k 386 1680x1050x32 VESA
+l 389 1920x1200x32 VESA m 38C 2560x1600x32 VESA n 38F 1280x720x32 VESA
+o 392 1920x1080x32 VESA p 300 640x400x8 VESA q 301 640x480x8 VESA
+r 303 800x600x8 VESA s 305 1024x768x8 VESA t 307 1280x1024x8 VESA
+u 30D 320x200x15 VESA v 30E 320x200x16 VESA w 30F 320x200x24 VESA
+x 310 640x480x15 VESA y 311 640x480x16 VESA z 312 640x480x24 VESA
+ 313 800x600x15 VESA 314 800x600x16 VESA 315 800x600x24 VESA
+ 316 1024x768x15 VESA 317 1024x768x16 VESA 318 1024x768x24 VESA
+ 319 1280x1024x15 VESA 31A 1280x1024x16 VESA 31B 1280x1024x24 VESA
+ 31C 1600x1200x8 VESA 31D 1600x1200x15 VESA 31E 1600x1200x16 VESA
+ 31F 1600x1200x24 VESA 346 320x200x8 VESA 348 1152x864x8 VESA
+ 349 1152x864x15 VESA 34A 1152x864x16 VESA 34B 1152x864x24 VESA
+ 375 1280x768x16 VESA 376 1280x768x24 VESA 378 1280x800x16 VESA
+ 379 1280x800x24 VESA 37B 1280x960x16 VESA 37C 1280x960x24 VESA
+ 37E 1440x900x16 VESA 37F 1440x900x24 VESA 381 1400x1050x16 VESA
+ 382 1400x1050x24 VESA 384 1680x1050x16 VESA 385 1680x1050x24 VESA
+ 387 1920x1200x16 VESA 388 1920x1200x24 VESA 38A 2560x1600x16 VESA
+ 38B 2560x1600x24 VESA 38D 1280x720x16 VESA 38E 1280x720x24 VESA
+ 390 1920x1080x16 VESA 391 1920x1080x24 VESA 393 1600x900x16 VESA
+ 394 1600x900x24 VESA 395 1600x900x32 VESA 396 2560x1440x16 VESA
+ 397 2560x1440x24 VESA 398 2560x1440x32 VESA 399 3840x2160x16 VESA
+ 200 40x25 VESA 201 40x25 VESA 202 80x25 VESA
+ 203 80x25 VESA 207 80x25 VESA 213 320x200x8 VESA
+Enter a video mode or "scan" to scan for additional modes:
+```
+
+### Early heap API
+
+Before proceeding further to investigate what the `set_video` function does, it will be useful to take a look at the API for the management of the kernel's early heap.
+
+After getting the video mode set by the bootloader, we can see the heap being reset by the `RESET_HEAP` macro. The definition of this macro is in [arch/x86/boot/boot.h](https://github.com/torvalds/linux/blob/master/arch/x86/boot/boot.h):
+
+
```C
#define RESET_HEAP() ((void *)( HEAP = _end ))
```
-If you have read the second part, you will remember that we initialized the heap with the [`init_heap`](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/main.c) function. We have a couple of utility macros and functions for managing the heap which are defined in `arch/x86/boot/boot.h` header file.
+If you have read the previous [part](./linux-bootstrap-2.md#kernel-booting-process-part-2), you should remember that we have seen the initialization of the heap memory area. The kernel setup code provides a couple of utility macros and functions for managing the early heap. Let's take a look at some of them, especially the ones which we will meet in this chapter.
-They are:
+The `RESET_HEAP` macro resets the heap by setting the `HEAP` variable to `_end`, which marks the end of the kernel setup code and data (including the `.bss` section). By doing this we just set the heap pointer to the very beginning of the heap.
-```C
-#define RESET_HEAP()
-```
-
-As we saw just above, it resets the heap by setting the `HEAP` variable to `_end`, where `_end` is just `extern char _end[];`
-
-Next is the `GET_HEAP` macro:
+The next useful macro is:
+
```C
#define GET_HEAP(type, n) \
((type *)__get_heap(sizeof(type),__alignof__(type),(n)))
```
-for heap allocation. It calls the internal function `__get_heap` with 3 parameters:
+The goal of this macro is to allocate memory on the early heap. This macro calls the `__get_heap` function from the same header file with the following three parameters:
-* the size of the datatype to be allocated for
-* `__alignof__(type)` specifies how variables of this type are to be aligned
-* `n` specifies how many items to allocate
+- The size of the datatype to be allocated for
+- The alignment required for variables of this type
+- The number of items of this type to allocate
The implementation of `__get_heap` is:
+
```C
static inline char *__get_heap(size_t s, size_t a, size_t n)
{
@@ -106,64 +175,27 @@ static inline char *__get_heap(size_t s, size_t a, size_t n)
}
```
-and we will further see its usage, something like:
+Let's try to understand how the `__get_heap` function works. First of all, we can see that the `HEAP` pointer is set to the [aligned](https://en.wikipedia.org/wiki/Data_structure_alignment) address of the memory. The address is aligned based on the alignment requirement of the data type for which we want to allocate memory. After we have got the initial aligned address, we just move the `HEAP` pointer forward by the requested size.
-```C
-saved.data = GET_HEAP(u16, saved.x * saved.y);
-```
-
-Let's try to understand how `__get_heap` works. We can see here that `HEAP` (which is equal to `_end` after `RESET_HEAP()`) is assigned the address of the aligned memory according to the `a` parameter. After this we save the memory address from `HEAP` to the `tmp` variable, move `HEAP` to the end of the allocated block and return `tmp` which is the start address of allocated memory.
-
-And the last function is:
+Last but not least, the final early heap API that we will see is the `heap_free` function, which checks whether the given amount of memory is available on the heap:
+
```C
static inline bool heap_free(size_t n)
{
- return (int)(heap_end - HEAP) >= (int)n;
+ return (int)(heap_end-HEAP) >= (int)n;
}
```
-which subtracts value of the `HEAP` pointer from the `heap_end` (we calculated it in the previous [part](linux-bootstrap-2.md)) and returns 1 if there is enough memory available for `n`.
-
-That's all. Now we have a simple API for heap and can setup video mode.
-
-Set up video mode
---------------------------------------------------------------------------------
+As you may see, the implementation of this function is pretty trivial. It just subtracts the current value of the heap pointer from the address which represents the end of heap memory area. The function returns `true` if there is enough memory for `n` or `false` otherwise.
-Now we can move directly to video mode initialization. We stopped at the `RESET_HEAP()` call in the `set_video` function. Next is the call to `store_mode_params` which stores video mode parameters in the `boot_params.screen_info` structure which is defined in [include/uapi/linux/screen_info.h](https://github.com/torvalds/linux/blob/v4.16/include/uapi/linux/screen_info.h) header file.
+### Return to the setup of the video mode
-If we look at the `store_mode_params` function, we can see that it starts with a call to the `store_cursor_position` function. As you can understand from the function name, it gets information about the cursor and stores it.
+Since the heap pointer is in the right place, we can move directly to video mode initialization. The next step is the call to the `store_mode_params` function, which stores the currently available video mode parameters in `boot_params.screen_info`. This structure is defined in the [include/uapi/linux/screen_info.h](https://github.com/torvalds/linux/blob/master/include/uapi/linux/screen_info.h) header file and provides basic information about the screen and video mode, such as the current position of the cursor, the BIOS video mode number that was set when the kernel was loaded, the number of text rows and columns, and so on. The `store_mode_params` function asks the BIOS services about this information and stores it in this structure for later usage.
-First of all, `store_cursor_position` initializes two variables which have type `biosregs` with `AH = 0x3`, and calls the `0x10` BIOS interruption. After the interruption is successfully executed, it returns row and column in the `DL` and `DH` registers. Row and column will be stored in the `orig_x` and `orig_y` fields of the `boot_params.screen_info` structure.
-
-After `store_cursor_position` is executed, the `store_video_mode` function will be called. It just gets the current video mode and stores it in `boot_params.screen_info.orig_video_mode`.
-
-After this, `store_mode_params` checks the current video mode and sets the `video_segment`. After the BIOS transfers control to the boot sector, the following addresses are for video memory:
-
-```
-0xB000:0x0000 32 Kb Monochrome Text Video Memory
-0xB800:0x0000 32 Kb Color Text Video Memory
-```
-
-So we set the `video_segment` variable to `0xb000` if the current video mode is MDA, HGC, or VGA in monochrome mode and to `0xb800` if the current video mode is in color mode. After setting up the address of the video segment, the font size needs to be stored in `boot_params.screen_info.orig_video_points` with:
-
-```C
-set_fs(0);
-font_size = rdfs16(0x485);
-boot_params.screen_info.orig_video_points = font_size;
-```
-
-First of all, we put 0 in the `FS` register with the `set_fs` function. We already saw functions like `set_fs` in the previous part. They are all defined in [arch/x86/boot/boot.h](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/boot.h). Next, we read the value which is located at address `0x485` (this memory location is used to get the font size) and save the font size in `boot_params.screen_info.orig_video_points`.
-
-```C
-x = rdfs16(0x44a);
-y = (adapter == ADAPTER_CGA) ? 25 : rdfs8(0x484)+1;
-```
-
-Next, we get the amount of columns by address `0x44a` and rows by address `0x484` and store them in `boot_params.screen_info.orig_video_cols` and `boot_params.screen_info.orig_video_lines`. After this, execution of `store_mode_params` is finished.
-
-Next we can see the `save_screen` function which just saves the contents of the screen to the heap. This function collects all the data which we got in the previous functions (like the rows and columns, and stuff) and stores it in the `saved_screen` structure, which is defined as:
+The next step is to save the current contents of the screen to the heap by calling the `save_screen` function. This function collects all the data which we got in the previous functions (like the rows and columns, and stuff) and stores it in the `saved_screen` structure, which is defined as:
+
```C
static struct saved_screen {
int x, y;
@@ -172,25 +204,25 @@ static struct saved_screen {
} saved;
```
-It then checks whether the heap has free space for it with:
+After the contents of the screen are saved, the next step is to collect the video modes currently available in the system. This job is done by the `probe_cards` function defined in [arch/x86/boot/video-mode.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/video-mode.c). It goes over all `video_cards` and collects the information about them:
```C
-if (!heap_free(saved.x*saved.y*sizeof(u16)+512))
- return;
+for (card = video_cards; card < video_cards_end; card++) {
+ /* collecting the number of video modes */
+}
```
-and allocates space in the heap if it is enough and stores `saved_screen` in it.
-
-The next call is `probe_cards(0)` from [arch/x86/boot/video-mode.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/video-mode.c) source code file. It goes over all video_cards and collects the number of modes provided by the cards. Here is the interesting part, we can see the loop:
+The `video_cards` is an array defined as:
+
```C
-for (card = video_cards; card < video_cards_end; card++) {
- /* collecting number of modes here */
-}
+#define __videocard struct card_info __section(".videocards") __attribute__((used))
+extern struct card_info video_cards[], video_cards_end[];
```
-but `video_cards` is not declared anywhere. The answer is simple: every video mode presented in the x86 kernel setup code has a definition that looks like this:
+The `__videocard` macro allows us to define structures which describe video cards, and the linker will put them into the `video_cards` array. An example of such a structure can be found in [arch/x86/boot/video-vga.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/video-vga.c):
+
```C
static __videocard video_vga = {
.card_name = "VGA",
@@ -199,50 +231,18 @@ static __videocard video_vga = {
};
```
-where `__videocard` is a macro:
-
-```C
-#define __videocard struct card_info __attribute__((used,section(".videocards")))
-```
-
-which means that the `card_info` structure:
-
-```C
-struct card_info {
- const char *card_name;
- int (*set_mode)(struct mode_info *mode);
- int (*probe)(void);
- struct mode_info *modes;
- int nmodes;
- int unsafe;
- u16 xmode_first;
- u16 xmode_n;
-};
-```
-
-is in the `.videocards` segment. Let's look in the [arch/x86/boot/setup.ld](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/setup.ld) linker script, where we can find:
-
-```
- .videocards : {
- video_cards = .;
- *(.videocards)
- video_cards_end = .;
- }
-```
+After the `probe_cards` function executes, we have a bunch of structures in our `video_cards` array and the known number of video modes they provide. At the next step, the kernel setup code will print a menu with the available video modes if the `vga=ask` option was passed on the kernel command line, and then set up the video mode using all the parameters that we have gathered in the previous steps. The video mode is set by the `set_mode` function, which is defined in [video-mode.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/video-mode.c). This function expects one parameter - the video mode identifier. This identifier is set by the bootloader or set based on the choice made in the video modes menu. The `set_mode` function goes over all available video cards defined in the `video_cards` array and if the given mode belongs to the given card, the `card->set_mode()` callback is called to set up the video mode.
-It means that `video_cards` is just a memory address and all `card_info` structures are placed in this segment. It means that all `card_info` structures are placed between `video_cards` and `video_cards_end`, so we can use a loop to go over all of it. After `probe_cards` executes we have a bunch of structures like `static __videocard video_vga` with the `nmodes` (the number of video modes) filled in.
-
-After the `probe_cards` function is done, we move to the main loop in the `set_video` function. There is an infinite loop which tries to set up the video mode with the `set_mode` function or prints a menu if we passed `vid_mode=ask` to the kernel command line or if video mode is undefined.
-
-The `set_mode` function is defined in [video-mode.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/video-mode.c) and gets only one parameter, `mode`, which is the number of video modes (we got this value from the menu or in the start of `setup_video`, from the kernel setup header).
-
-The `set_mode` function checks the `mode` and calls the `raw_set_mode` function. The `raw_set_mode` calls the selected card's `set_mode` function, i.e. `card->set_mode(struct mode_info*)`. We can get access to this function from the `card_info` structure. Every video mode defines this structure with values filled depending upon the video mode (for example for `vga` it is the `video_vga.set_mode` function. See the above example of the `card_info` structure for `vga`). `video_vga.set_mode` is `vga_set_mode`, which checks the vga mode and calls the respective function:
+Let's take a look at the example of setting up [VGA](https://en.wikipedia.org/wiki/Video_Graphics_Array) video mode:
+
```C
static int vga_set_mode(struct mode_info *mode)
{
+ /* Set the basic mode */
vga_set_basic_mode();
+ /* Override a possibly broken BIOS */
force_x = mode->x;
force_y = mode->y;
@@ -268,53 +268,60 @@ static int vga_set_mode(struct mode_info *mode)
vga_set_80x60();
break;
}
+
return 0;
}
```
-Every function which sets up video mode just calls the `0x10` BIOS interrupt with a certain value in the `AH` register.
+The `vga_set_mode` function is responsible for configuring the VGA display to a specific text mode, based on the settings which we collected in the previous steps. The `vga_set_basic_mode` function resets the VGA hardware into a standard text mode. The next statement sets up the video mode based on the video mode that was selected. Most of these functions have very similar implementation based on the `0x10` BIOS interrupt.
-After we have set the video mode, we pass it to `boot_params.hdr.vid_mode`.
+After this step, the video mode is configured and we save all the information about it again for later use. Having done this, the video mode setup is complete and now we can take a look at the last preparation before we will see the switch into the protected mode.
-Next, `vesa_store_edid` is called. This function simply stores the [EDID](https://en.wikipedia.org/wiki/Extended_Display_Identification_Data) (**E**xtended **D**isplay **I**dentification **D**ata) information for kernel use. After this `store_mode_params` is called again. Lastly, if `do_restore` is set, the screen is restored to an earlier state.
+## Last preparation before transition into protected mode
-Having done this, the video mode setup is complete and now we can switch to the protected mode.
+Returning to the [`main`](https://github.com/torvalds/linux/blob/master/arch/x86/boot/main.c) function of the early kernel setup code, we finally can see:
-Last preparation before transition into protected mode
---------------------------------------------------------------------------------
+
+```C
+ /* Do the last things and invoke protected mode */
+ go_to_protected_mode();
+```
-We can see the last function call - `go_to_protected_mode` - in [arch/x86/boot/main.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/main.c). As the comment says: `Do the last things and invoke protected mode`, so let's see what these last things are and switch into protected mode.
+As the comment says: `Do the last things and invoke protected mode`, so let's see what these last things are and switch into protected mode.
-The `go_to_protected_mode` function is defined in [arch/x86/boot/pm.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/pm.c). It contains some functions which make the last preparations before we can jump into protected mode, so let's look at it and try to understand what it does and how it works.
+The `go_to_protected_mode` function is defined in [arch/x86/boot/pm.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/pm.c). It contains some routines which make the last preparations before we can jump into protected mode, so let's look at it and try to understand what it does and how it works.
-First is the call to the `realmode_switch_hook` function in `go_to_protected_mode`. This function invokes the real mode switch hook if it is present and disables [NMI](http://en.wikipedia.org/wiki/Non-maskable_interrupt). Hooks are used if the bootloader runs in a hostile environment. You can read more about hooks in the [boot protocol](https://www.kernel.org/doc/Documentation/x86/boot.txt) (see **ADVANCED BOOT LOADER HOOKS**).
+The very first function that we see in `go_to_protected_mode` is the `realmode_switch_hook` function. This function invokes the real mode switch hook if it is present or disables [NMI](http://en.wikipedia.org/wiki/Non-maskable_interrupt) otherwise. The hooks are used if the bootloader runs in a hostile environment. You can read more about hooks in the [boot protocol](https://www.kernel.org/doc/Documentation/x86/boot.txt) (see **ADVANCED BOOT LOADER HOOKS**). Interrupts must be disabled before switching to protected mode because otherwise the CPU could receive an interrupt when there is no valid interrupt table or handlers. Once the kernel has set up the protected-mode interrupt infrastructure, interrupts will be enabled again.
-The `realmode_switch` hook presents a pointer to the 16-bit real mode far subroutine which disables non-maskable interrupts. After the `realmode_switch` hook (it isn't present for me) is checked, Non-Maskable Interrupts(NMI) is disabled:
+We will consider only the more or less standard use case, when the bootloader does not provide any hooks. So we just disable non-maskable interrupts:
+
```assembly
-asm volatile("cli");
-outb(0x80, 0x70); /* Disable NMI */
-io_delay();
+ asm volatile("cli");
+ outb(0x80, 0x70); /* Disable NMI */
+ io_delay();
```
-At first, there is an inline assembly statement with a `cli` instruction which clears the interrupt flag (`IF`). After this, external interrupts are disabled. The next line disables NMI (non-maskable interrupt).
-
-An interrupt is a signal to the CPU which is emitted by hardware or software. After getting such a signal, the CPU suspends the current instruction sequence, saves its state and transfers control to the interrupt handler. After the interrupt handler has finished its work, it transfers control back to the interrupted instruction. Non-maskable interrupts (NMI) are interrupts which are always processed, independently of permission. They cannot be ignored and are typically used to signal for non-recoverable hardware errors. We will not dive into the details of interrupts now but we will be discussing them in the coming posts.
+At the first line, there is an [inline assembly](../Theory/linux-theory-3.md) statement with the `cli` instruction which clears the [interrupt flag](https://en.wikipedia.org/wiki/Interrupt_flag). After this, external interrupts are disabled. The next line disables NMI (non-maskable interrupt). An interrupt is a signal to the CPU which is emitted by hardware or software. After getting such a signal, the CPU suspends the current instruction sequence, saves its state and transfers control to the interrupt handler. After the interrupt handler has finished its work, it transfers control back to the interrupted instruction. Non-maskable interrupts (NMI) are interrupts which are always processed, independently of permission. They cannot be ignored and are typically used to signal for non-recoverable hardware errors. We will not dive into the details of interrupts now but we will be discussing them in the next posts.
-Let's get back to the code. We can see in the second line that we are writing the byte `0x80` (disabled bit) to `0x70` (the CMOS Address register). After that, a call to the `io_delay` function occurs. `io_delay` causes a small delay and looks like:
+Let's get back to the code. We can see in the second line that we are writing the byte `0x80` (the NMI-disable bit) to the port `0x70` (the CMOS address register). After that, a call to the `io_delay` function occurs. `io_delay` causes a small delay and looks like:
+
```C
static inline void io_delay(void)
{
const u16 DELAY_PORT = 0x80;
- asm volatile("outb %%al,%0" : : "dN" (DELAY_PORT));
+ outb(0, DELAY_PORT);
}
```
-To output any byte to the port `0x80` should delay exactly 1 microsecond. So we can write any value (the value from `AL` in our case) to the `0x80` port. After this delay the `realmode_switch_hook` function has finished execution and we can move to the next function.
+Outputting any byte to the port `0x80` should cause a delay of exactly 1 microsecond. This delay is needed to be sure that the change of the NMI mask has fully taken effect. After this delay, the `realmode_switch_hook` function has finished execution and we can be sure that all interrupts are disabled.
-The next function is `enable_a20`, which enables the [A20 line](http://en.wikipedia.org/wiki/A20_line). This function is defined in [arch/x86/boot/a20.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/a20.c) and it tries to enable the A20 gate with different methods. The first is the `a20_test_short` function which checks if A20 is already enabled or not with the `a20_test` function:
+The next step is the `enable_a20` function, which enables the [A20 line](http://en.wikipedia.org/wiki/A20_line). Enabling this line allows the kernel to access memory above 1 MB.
+The `enable_a20` function is defined in [arch/x86/boot/a20.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/a20.c) and tries to enable the `A20` gate using different approaches. The first is the `a20_test_short` function which checks if `A20` is already enabled or not using the `a20_test` function:
+
+
```C
static int a20_test(int loops)
{
@@ -339,46 +346,60 @@ static int a20_test(int loops)
}
```
-First of all, we put `0x0000` in the `FS` register and `0xffff` in the `GS` register. Next, we read the value at the address `A20_TEST_ADDR` (it is `0x200`) and put this value into the variables `saved` and `ctr`.
-
-Next, we write an updated `ctr` value into `fs:A20_TEST_ADDR` or `fs:0x200` with the `wrfs32` function, then delay for 1ms, and then read the value from the `GS` register into the address `A20_TEST_ADDR+0x10`. In a case when `a20` line is disabled, the address will be overlapped, in other case if it's not zero `a20` line is already enabled the A20 line.
+To verify whether the `A20` line is already enabled or not, the kernel performs a simple memory test. It begins by setting the `FS` register to `0x0000` and the `GS` register to `0xffff`. By doing this, an access to `FS:0x200` (`A20_TEST_ADDR`) points into the very beginning of memory, while an access to `GS:0x210` (`A20_TEST_ADDR + 0x10`) refers to a location just past the one-megabyte boundary. If the `A20` line is disabled, the latter will wrap around and point to the same physical address.
-If A20 is disabled, we try to enable it with a different method which you can find in `a20.c`. For example, it can be done with a call to the `0x15` BIOS interrupt with `AH=0x2041`.
+If the `A20` gate is disabled, the kernel will try to enable it using different methods which you can find in the `enable_a20` function. For example, it can be done with a call to the `0x15` BIOS interrupt with the `AX` register set to `0x2401`. If the `enable_a20` function fails, the kernel prints an error message and calls the `die` function, which stops the kernel setup process.
-If the `enable_a20` function finished with a failure, print an error message and call the function `die`. You can remember it from the first source code file where we started - [arch/x86/boot/header.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/header.S):
+After the `A20` gate is successfully enabled, the `reset_coprocessor` function is called:
-```assembly
-die:
- hlt
- jmp die
- .size die, .-die
+
+```C
+static void reset_coprocessor(void)
+{
+ outb(0, 0xf0);
+ io_delay();
+ outb(0, 0xf1);
+ io_delay();
+}
```
-After the A20 gate is successfully enabled, the `reset_coprocessor` function is called:
+This function puts the [math coprocessor](https://en.wikipedia.org/wiki/Floating-point_unit) into a clean state: it first clears it by writing `0` to the port `0xF0` and then resets it by writing `0` to the port `0xF1`.
+
+The next step is the `mask_all_interrupts` function:
+
```C
-outb(0, 0xf0);
-outb(0, 0xf1);
+static void mask_all_interrupts(void)
+{
+ outb(0xff, 0xa1); /* Mask all interrupts on the secondary PIC */
+ io_delay();
+ outb(0xfb, 0x21); /* Mask all but cascade on the primary PIC */
+ io_delay();
+}
```
-This function clears the Math Coprocessor by writing `0` to `0xf0` and then resets it by writing `0` to `0xf1`.
+This function masks, or in other words forbids, all interrupts on the secondary and all interrupts except the cascade line on the primary [PICs](https://en.wikipedia.org/wiki/Programmable_interrupt_controller). This is needed for safety: we forbid the interrupts from the `PIC` so that nothing can interrupt the CPU while the kernel is making the transition into protected mode.
-After this, the `mask_all_interrupts` function is called:
+All the operations before this point were executed to make the transition to protected mode safe. The next operations will prepare the transition itself. Let's take a look at them.
-```C
-outb(0xff, 0xa1); /* Mask all interrupts on the secondary PIC */
-outb(0xfb, 0x21); /* Mask all but cascade on the primary PIC */
-```
+## Entering Protected Mode
+
+At this point, we are very close to seeing the Linux kernel switch into protected mode.
-This masks all interrupts on the secondary PIC (Programmable Interrupt Controller) and primary PIC except for IRQ2 on the primary PIC.
+Only two steps remain:
-And after all of these preparations, we can see the actual transition into protected mode.
+- Setting up the Interrupt Descriptor Table
+- Setting up the Global Descriptor Table
-Set up the Interrupt Descriptor Table
---------------------------------------------------------------------------------
+And that’s all! Once these two structures are configured, the Linux kernel can make the jump into protected mode.
-Now we set up the Interrupt Descriptor table (IDT) in the `setup_idt` function:
+### Set up the Interrupt Descriptor Table
+Before the CPU can safely enter protected mode, it needs to know where to find the handlers that will be triggered in case of [interrupts and exceptions](https://en.wikipedia.org/wiki/Interrupt). In real mode, the CPU relies on the [Interrupt Vector Table](https://en.wikipedia.org/wiki/Interrupt_vector_table). In protected mode, this mechanism is replaced by the Interrupt Descriptor Table.
+
+This is a special structure located in memory which contains descriptors that describe where the CPU can find handlers for interrupts and exceptions. We will see the full description of the Interrupt Descriptor Table and its entries later, because for now all the interrupts were disabled at the previous steps anyway. Let's take a look at the function which sets up a zero-filled Interrupt Descriptor Table:
+
+
```C
static void setup_idt(void)
{
@@ -387,8 +408,9 @@ static void setup_idt(void)
}
```
-which sets up the Interrupt Descriptor Table (describes interrupt handlers and etc.). For now, the IDT is not installed (we will see it later), but now we just load the IDT with the `lidtl` instruction. `null_idt` contains the address and size of the IDT, but for now they are just zero. `null_idt` is a `gdt_ptr` structure, it is defined as:
+As we can see, it just loads the IDT, which is filled with zeros, using the `lidtl` instruction. The `null_idt` has the type `gdt_ptr`, which is a structure defined in the same source code file:
+
```C
struct gdt_ptr {
u16 len;
@@ -396,239 +418,172 @@ struct gdt_ptr {
} __attribute__((packed));
```
-where we can see the 16-bit length(`len`) of the IDT and the 32-bit pointer to it (More details about the IDT and interruptions will be seen in the next posts). ` __attribute__((packed))` means that the size of `gdt_ptr` is the minimum required size. So the size of the `gdt_ptr` will be 6 bytes here or 48 bits. (Next we will load the pointer to the `gdt_ptr` to the `GDTR` register and you might remember from the previous post that it is 48-bits in size).
+This structure provides information about the pointer to the Interrupt Descriptor Table.
-Set up Global Descriptor Table
---------------------------------------------------------------------------------
+### Set up Global Descriptor Table
-Next is the setup of the Global Descriptor Table (GDT). We can see the `setup_gdt` function which sets up the GDT (you can read about it in the post [Kernel booting process. Part 2.](linux-bootstrap-2.md#protected-mode)). There is a definition of the `boot_gdt` array in this function, which contains the definition of the three segments:
+Next is the setup of the Global Descriptor Table. As you may remember, memory access in real mode is based on `segment:offset` addressing. Protected mode introduces a different model based on the `Global Descriptor Table`. If you forgot the details about the Global Descriptor Table structure, you can find more information in the [previous chapter](./linux-bootstrap-2.md#protected-mode). Instead of fixed segment bases and limits, the CPU now looks for memory regions defined by descriptors located in the Global Descriptor Table. The goal of the kernel is to set up these descriptors.
-```C
-static const u64 boot_gdt[] __attribute__((aligned(16))) = {
- [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(0xc09b, 0, 0xfffff),
- [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(0xc093, 0, 0xfffff),
- [GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(0x0089, 4096, 103),
-};
-```
-
-for code, data and TSS (Task State Segment). We will not use the task state segment for now, it was added there to make Intel VT happy as we can see in the comment line (if you're interested you can find the commit which describes it - [here](https://github.com/torvalds/linux/commit/88089519f302f1296b4739be45699f06f728ec31)). Let's look at `boot_gdt`. First of all note that it has the `__attribute__((aligned(16)))` attribute. It means that this structure will be aligned by 16 bytes.
-
-Let's look at a simple example:
+All of this work is done by the `setup_gdt` function, which is defined in the same source code file. Let's take a look at the definition of this function:
+
```C
-#include
-
-struct aligned {
- int a;
-}__attribute__((aligned(16)));
-
-struct nonaligned {
- int b;
-};
-
-int main(void)
+static void setup_gdt(void)
{
- struct aligned a;
- struct nonaligned na;
-
- printf("Not aligned - %zu \n", sizeof(na));
- printf("Aligned - %zu \n", sizeof(a));
-
- return 0;
+ /* There are machines which are known to not boot with the GDT
+ being 8-byte unaligned. Intel recommends 16 byte alignment. */
+ static const u64 boot_gdt[] __attribute__((aligned(16))) = {
+ /* CS: code, read/execute, 4 GB, base 0 */
+ [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(DESC_CODE32, 0, 0xfffff),
+ /* DS: data, read/write, 4 GB, base 0 */
+ [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(DESC_DATA32, 0, 0xfffff),
+ /* TSS: 32-bit tss, 104 bytes, base 4096 */
+ /* We only have a TSS here to keep Intel VT happy;
+ we don't actually use it for anything. */
+ [GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(DESC_TSS32, 4096, 103),
+ };
+ /* Xen HVM incorrectly stores a pointer to the gdt_ptr, instead
+ of the gdt_ptr contents. Thus, make it static so it will
+ stay in memory, at least long enough that we switch to the
+ proper kernel GDT. */
+ static struct gdt_ptr gdt;
+
+ gdt.len = sizeof(boot_gdt)-1;
+ gdt.ptr = (u32)&boot_gdt + (ds() << 4);
+
+ asm volatile("lgdtl %0" : : "m" (gdt));
}
```
-Technically a structure which contains one `int` field must be 4 bytes in size, but an `aligned` structure will need 16 bytes to store in memory:
-
-```
-$ gcc test.c -o test && test
-Not aligned - 4
-Aligned - 16
-```
-
-The `GDT_ENTRY_BOOT_CS` has index - 2 here, `GDT_ENTRY_BOOT_DS` is `GDT_ENTRY_BOOT_CS + 1` and etc. It starts from 2, because the first is a mandatory null descriptor (index - 0) and the second is not used (index - 1).
-
-`GDT_ENTRY` is a macro which takes flags, base, limit and builds a GDT entry. For example, let's look at the code segment entry. `GDT_ENTRY` takes the following values:
-
-* base - 0
-* limit - 0xfffff
-* flags - 0xc09b
-
-What does this mean? The segment's base address is 0, and the limit (size of segment) is - `0xfffff` (1 MB). Let's look at the flags. It is `0xc09b` and it will be:
-
-```
-1100 0000 1001 1011
-```
-
-in binary. Let's try to understand what every bit means. We will go through all bits from left to right:
+The initial memory descriptors are specified by the items of the `boot_gdt` array. The `setup_gdt` function just loads the pointer to the Global Descriptor Table filled with these items using the `lgdtl` instruction. Let's take a closer look at the definitions of the memory descriptors.
-* 1 - (G) granularity bit
-* 1 - (D) if 0 16-bit segment; 1 = 32-bit segment
-* 0 - (L) executed in 64-bit mode if 1
-* 0 - (AVL) available for use by system software
-* 0000 - 4-bit length 19:16 bits in the descriptor
-* 1 - (P) segment presence in memory
-* 00 - (DPL) - privilege level, 0 is the highest privilege
-* 1 - (S) code or data segment, not a system segment
-* 101 - segment type execute/read/
-* 1 - accessed bit
+Initially, three memory descriptors are specified:
-You can read more about every bit in the previous [post](linux-bootstrap-2.md) or in the [Intel® 64 and IA-32 Architectures Software Developer's Manuals 3A](http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html).
+- Code segment
+- Data segment
+- Task state segment
-After this we get the length of the GDT with:
-
-```C
-gdt.len = sizeof(boot_gdt)-1;
-```
+We will skip the description of the task state segment for now as it was added there to make [Intel VT](https://en.wikipedia.org/wiki/X86_virtualization#Intel_virtualization_(VT-x)) happy. The other two segments describe the memory for the kernel code and data sections. Both memory descriptors are defined using the `GDT_ENTRY` macro. This macro is defined in [arch/x86/include/asm/segment.h](https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/segment.h) and expects three arguments:
-We get the size of `boot_gdt` and subtract 1 (the last valid address in the GDT).
+- `flags`
+- `base`
+- `limit`
-Next we get a pointer to the GDT with:
+Let's take a look at the definition of the code memory segment:
```C
-gdt.ptr = (u32)&boot_gdt + (ds() << 4);
+[GDT_ENTRY_BOOT_CS] = GDT_ENTRY(DESC_CODE32, 0, 0xfffff),
```
-Here we just get the address of `boot_gdt` and add it to the address of the data segment left-shifted by 4 bits (remember we're in real mode now).
+The base address of this memory segment is defined as `0` and the limit as `0xFFFFF`. The `DESC_CODE32` describes the flags of this segment. If we take a look at the flags, we will see that the granularity (bit `G`) of this segment is set, so the limit is counted in 4 KB units rather than in bytes. This means that the segment covers addresses `0x00000000–0xFFFFFFFF` - the entire 4 GB linear address space. The same base address and limit are defined for the data segment. It is done this way because the Linux kernel uses the so-called [flat memory model](https://en.wikipedia.org/wiki/Flat_memory_model).
-Lastly we execute the `lgdtl` instruction to load the GDT into the GDTR register:
+Besides the granularity bit, the `DESC_CODE32` specifies other flags. Among them, you can find that this is a 32-bit segment which is readable, executable, and present in memory. The privilege level is set to `0`, the most privileged level, as the kernel needs.
-```C
-asm volatile("lgdtl %0" : : "m" (gdt));
-```
+Looking at the documentation of the Global Descriptor Table and its entries you can check all the initial segments by yourself. It is not so hard.
-Actual transition into protected mode
---------------------------------------------------------------------------------
+## Transition into protected mode
-This is the end of the `go_to_protected_mode` function. We loaded the IDT and GDT, disabled interrupts and now can switch the CPU into protected mode. The last step is calling the `protected_mode_jump` function with two parameters:
+We are standing right before it. Interrupts are disabled, and the Interrupt Descriptor Table and Global Descriptor Table are initialized. Finally, the kernel can execute the jump into protected mode. But despite the good news, we need to return to assembly again 😅
-```C
-protected_mode_jump(boot_params.hdr.code32_start, (u32)&boot_params + (ds() << 4));
-```
-
-which is defined in [arch/x86/boot/pmjump.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/pmjump.S).
-
-It takes two parameters:
-
-* address of the protected mode entry point
-* address of `boot_params`
-
-Let's look inside `protected_mode_jump`. As I wrote above, you can find it in `arch/x86/boot/pmjump.S`. The first parameter will be in the `eax` register and the second one is in `edx`.
-
-First of all, we put the address of `boot_params` in the `esi` register and the address of the code segment register `cs` in `bx`.
+We can find the transition to protected mode in [arch/x86/boot/pmjump.S](https://github.com/torvalds/linux/blob/master/arch/x86/boot/pmjump.S). Let's take a look at it:
+
```assembly
-GLOBAL(protected_mode_jump)
- movl %edx, %esi # Pointer to boot_params table
+SYM_FUNC_START_NOALIGN(protected_mode_jump)
+ movl %edx, %esi # Pointer to boot_params table
xorl %ebx, %ebx
movw %cs, %bx
-```
-
-After this, we shift `bx` by 4 bits and add it to the memory location labeled `2` (which is `(cs << 4) + in_pm32`, the physical address to jump after transitioned to 32-bit mode) and jump to label `1`.
-
-```assembly
shll $4, %ebx
- addl %ebx, 2f # Add %ebx to the value stored at label 2
- jmp 1f # Short jump to serialize on 386/486
-```
-
-So after this `in_pm32` in label `2` will be overwritten with `(cs << 4) + in_pm32`.
-
-Next we put the data segment and the task state segment in the `cx` and `di` registers with:
+ addl %ebx, 2f
+ jmp 1f # Short jump to serialize on 386/486
+1:
-```assembly
movw $__BOOT_DS, %cx
movw $__BOOT_TSS, %di
-```
-As you can read above `GDT_ENTRY_BOOT_CS` has index 2 and every GDT entry is 8 byte, so `CS` will be `2 * 8 = 16`, `__BOOT_DS` is 24 etc.
-
-Next, we set the `PE` (Protection Enable) bit in the `CR0` control register:
-
-```assembly
movl %cr0, %edx
- orb $X86_CR0_PE, %dl
+ orb $X86_CR0_PE, %dl # Protected mode
movl %edx, %cr0
```
-and make a long jump to protected mode:
+First of all, we preserve the address of `boot_params` structure in the `esi` register. After this, we compute the real-mode segment base of the current code and add it to the value pointed to by the `2f` label which is the entry point to the protected mode. This is needed because as you remember at the previous step, the code memory segment starts from `0`, so the jump instruction must contain absolute linear address of the entry point.
+
+In the next steps, we save the segment selectors of the data and task state segments in the general purpose registers `cx` and `di` and set the `PE` bit in the `cr0` control register. From this point, protected mode is turned on, and we just need to jump into it to set the proper value of the code segment register:
+
```assembly
- .byte 0x66, 0xea
-2: .long in_pm32
- .word __BOOT_CS
+ # Transition to 32-bit mode
+ .byte 0x66, 0xea # ljmpl opcode
+2: .long .Lin_pm32 # offset
+ .word __BOOT_CS # segment
```
-where:
-
-* `0x66` is the operand-size prefix which allows us to mix 16-bit and 32-bit code
-* `0xea` - is the jump opcode
-* `in_pm32` is the segment offset under protect mode, which has value `(cs << 4) + in_pm32` derived from real mode
-* `__BOOT_CS` is the code segment we want to jump to.
-
-After this we are finally in protected mode:
+The kernel is in protected mode now 🥳🥳🥳
+
```assembly
-.code32
-.section ".text32","ax"
+ .code32
+ .section ".text32","ax"
+SYM_FUNC_START_LOCAL_NOALIGN(.Lin_pm32)
```
-Let's look at the first steps taken in protected mode. First of all we set up the data segment with:
+Let's look at the first steps taken in the protected mode. First of all we set up the data segment with the data segment address that we preserved in the `cx` register at the previous step:
+
```assembly
-movl %ecx, %ds
-movl %ecx, %es
-movl %ecx, %fs
-movl %ecx, %gs
-movl %ecx, %ss
+ # Set up data segments for flat 32-bit mode
+ movl %ecx, %ds
+ movl %ecx, %es
+ movl %ecx, %fs
+ movl %ecx, %gs
+ movl %ecx, %ss
```
-If you paid attention, you can remember that we saved `$__BOOT_DS` in the `cx` register. Now we fill it with all segment registers besides `cs` (`cs` is already `__BOOT_CS`).
-
-And setup a valid stack for debugging purposes:
+Since we are in the protected mode, our segment bases point to zero. Because of this, the stack pointer will point somewhere below the code, so we need to adjust it, at least for debugging purposes:
+
```assembly
-addl %ebx, %esp
+ addl %ebx, %esp
```
-The last step before the jump into 32-bit entry point is to clear the general purpose registers:
+The last step before the jump into actual 32-bit entry point is to clear the general purpose registers:
+
```assembly
-xorl %ecx, %ecx
-xorl %edx, %edx
-xorl %ebx, %ebx
-xorl %ebp, %ebp
-xorl %edi, %edi
+ xorl %ecx, %ecx
+ xorl %edx, %edx
+ xorl %ebx, %ebx
+ xorl %ebp, %ebp
+ xorl %edi, %edi
```
-And jump to the 32-bit entry point in the end:
+Now everything is ready. The kernel is in the protected mode and we can jump to the next code, address of which was passed in the `eax` register:
+
+```assembly
+ jmpl *%eax # Jump to the 32-bit entrypoint
```
-jmpl *%eax
-```
-
-Remember that `eax` contains the address of the 32-bit entry (we passed it as the first parameter into `protected_mode_jump`).
-
-That's all. We're in protected mode and stop at its entry point. We will see what happens next in the next part.
-
-Conclusion
---------------------------------------------------------------------------------
-This is the end of the third part about linux kernel insides. In the next part, we will look at the first steps we take in protected mode and transition into [long mode](http://en.wikipedia.org/wiki/Long_mode).
+## Conclusion
-If you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).
+This is the end of the third part about Linux kernel insides. If you have questions or suggestions, feel free to ping me on X - [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com), or just create an [issue](https://github.com/0xAX/linux-insides/issues/new).
-**Please note that English is not my first language, And I am really sorry for any inconvenience. If you find any mistakes, please send me a PR with corrections at [linux-insides](https://github.com/0xAX/linux-internals).**
+## Links
-Links
---------------------------------------------------------------------------------
+Here is the list of the links that you may find useful during reading of this chapter:
-* [VGA](http://en.wikipedia.org/wiki/Video_Graphics_Array)
-* [VESA BIOS Extensions](http://en.wikipedia.org/wiki/VESA_BIOS_Extensions)
-* [Data structure alignment](http://en.wikipedia.org/wiki/Data_structure_alignment)
-* [Non-maskable interrupt](http://en.wikipedia.org/wiki/Non-maskable_interrupt)
-* [A20](http://en.wikipedia.org/wiki/A20_line)
-* [GCC designated inits](https://gcc.gnu.org/onlinedocs/gcc-4.1.2/gcc/Designated-Inits.html)
-* [GCC type attributes](https://gcc.gnu.org/onlinedocs/gcc/Type-Attributes.html)
-* [Previous part](linux-bootstrap-2.md)
+- [QEMU](https://www.qemu.org/)
+- [VGA](http://en.wikipedia.org/wiki/Video_Graphics_Array)
+- [VESA BIOS Extensions](http://en.wikipedia.org/wiki/VESA_BIOS_Extensions)
+- [Data structure alignment](http://en.wikipedia.org/wiki/Data_structure_alignment)
+- [Non-maskable interrupt](http://en.wikipedia.org/wiki/Non-maskable_interrupt)
+- [A20](http://en.wikipedia.org/wiki/A20_line)
+- [Math coprocessor](https://en.wikipedia.org/wiki/Floating-point_unit)
+- [PIC](https://en.wikipedia.org/wiki/Programmable_interrupt_controller)
+- [Interrupts and exceptions](https://en.wikipedia.org/wiki/Interrupt)
+- [Interrupt Vector Table](https://en.wikipedia.org/wiki/Interrupt_vector_table)
+- [Protected mode](https://en.wikipedia.org/wiki/Protected_mode)
+- [Intel VT](https://en.wikipedia.org/wiki/X86_virtualization#Intel_virtualization_(VT-x))
+- [Flat memory model](https://en.wikipedia.org/wiki/Flat_memory_model)
+- [Previous part](linux-bootstrap-2.md)
diff --git a/Booting/linux-bootstrap-4.md b/Booting/linux-bootstrap-4.md
index 87f6f273..52d1a7d9 100644
--- a/Booting/linux-bootstrap-4.md
+++ b/Booting/linux-bootstrap-4.md
@@ -1,26 +1,50 @@
-Kernel booting process. Part 4.
-================================================================================
+# Kernel booting process. Part 4.
-The Transition to 64-bit mode
---------------------------------------------------------------------------------
+In the previous [part](./linux-bootstrap-3.md), we saw the transition from [real mode](https://en.wikipedia.org/wiki/Real_mode) into [protected mode](http://en.wikipedia.org/wiki/Protected_mode). At this point, two crucial things have changed - the processor can address up to 4 gigabytes of memory and privilege levels are set for memory access. Despite this, the kernel is still in its early setup mode. There are many different things that have to be prepared and configured before we reach the main kernel's entry point. Since we are learning the Linux kernel for `x86_64` processors, protected mode is not the main mode in which the processor should operate. The next crucial step is to switch to the native mode for `x86_64` - [long mode](https://en.wikipedia.org/wiki/Long_mode).
-This is the fourth part of the `Kernel booting process`. Here, we will learn about the first steps taken in [protected mode](http://en.wikipedia.org/wiki/Protected_mode), like checking if the CPU supports [long mode](http://en.wikipedia.org/wiki/Long_mode) and [SSE](http://en.wikipedia.org/wiki/Streaming_SIMD_Extensions). We will initialize the page tables with [paging](http://en.wikipedia.org/wiki/Paging) and, at the end, transition the CPU to [long mode](https://en.wikipedia.org/wiki/Long_mode).
+The main characteristic of this new mode, as with all the earlier modes, is the way it defines the memory model. In real mode, the memory model was relatively simple: each memory location was formed from the base address specified in a segment register plus some offset. Protected mode introduced the Global and Local Descriptor Tables with descriptors which describe memory areas. All memory accesses in long mode are based on a new mechanism called [paging](https://en.wikipedia.org/wiki/Memory_paging). One of the crucial goals of the kernel before it can switch to long mode is to set up paging. We will see this and all the other details needed to switch to long mode in this chapter.
-**NOTE: there will be lots of assembly code in this part, so if you are not familiar with that, you might want to consult a book about it**
+> [!NOTE]
+> There will be lots of assembly code in this part, so if you are not familiar with that, you might want to consult a book about it or read another set of my [posts](https://github.com/0xAX/asm) about assembly programming.
-In the previous [part](https://github.com/0xAX/linux-insides/blob/v4.16/Booting/linux-bootstrap-3.md) we stopped at the jump to the `32-bit` entry point in [arch/x86/boot/pmjump.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/pmjump.S):
+## The 32-bit kernel entry point location
+The last point where we stopped was the [jump](https://en.wikipedia.org/wiki/Branch_(computer_science)#Implementation) to the kernel's entry point in protected mode. This jump is defined in the [arch/x86/boot/pmjump.S](https://github.com/torvalds/linux/blob/master/arch/x86/boot/pmjump.S) and looks like this:
+
+
```assembly
-jmpl *%eax
+ jmpl *%eax # Jump to the 32-bit entrypoint
+```
+
+The `eax` register contains the address of the `32-bit` entry point. What is this address? To answer this question, we can read the [Linux kernel x86 boot protocol](https://www.kernel.org/doc/Documentation/x86/boot.txt) document:
+
+> When using bzImage, the protected-mode kernel was relocated to 0x100000
+
+We can make sure that this is the 32-bit entry point of the Linux kernel using the [GNU GDB](https://sourceware.org/gdb/) debugger and running the Linux kernel in the [QEMU](https://www.qemu.org/) virtual machine. To do this, you can run the following command in one terminal:
+
+```bash
+sudo qemu-system-x86_64 -kernel ./linux/arch/x86/boot/bzImage \
+ -nographic \
+ -append "console=ttyS0 nokaslr" -s -S \
+ -initrd /boot/initramfs-6.17.0-rc3-g1b237f190eb3.img
```
-You will recall that the `eax` register contains the address of the 32-bit entry point. We can read about this in the [linux kernel x86 boot protocol](https://www.kernel.org/doc/Documentation/x86/boot.txt):
+> [!NOTE]
+> You need to pass your own kernel image and [initrd](https://en.wikipedia.org/wiki/Initial_ramdisk) image to the `-kernel` and `-initrd` command line options.
+
+After this, run the GNU GDB debugger in another terminal and pass the following commands:
```
-When using bzImage, the protected-mode kernel was relocated to 0x100000
+$ gdb
+(gdb) target remote :1234
+(gdb) hbreak *0x100000
+(gdb) c
+Continuing.
+
+Breakpoint 1, 0x0000000000100000 in ?? ()
```
-Let's make sure that this is so by looking at the register values at the 32-bit entry point:
+As soon as the debugger stops at the [breakpoint](https://en.wikipedia.org/wiki/Breakpoint), we can inspect the registers to be sure that the `eax` register contains `0x100000` - the address of the 32-bit kernel entry point:
```
eax 0x100000 1048576
@@ -33,51 +57,92 @@ esi 0x14470 83056
edi 0x0 0
eip 0x100000 0x100000
eflags 0x46 [ PF ZF ]
-cs 0x10 16
-ss 0x18 24
-ds 0x18 24
-es 0x18 24
-fs 0x18 24
-gs 0x18 24
```
-We can see here that the `cs` register contains a value of `0x10` (as you might recall from the [previous part](https://github.com/0xAX/linux-insides/blob/v4.16/Booting/linux-bootstrap-3.md), this is the second index in the `Global Descriptor Table`), the `eip` register contains the value `0x100000` and the base address of all segments including the code segment are zero.
+From the previous part, you may remember:
+
+> First of all, we preserve the address of `boot_params` structure in the `esi` register.
+
+So the `esi` register has the pointer to the `boot_params`. Let's inspect it to make sure that it is really it. For example we can take a look at the command line string that we passed to the virtual machine:
+
+```
+(gdb) x/s ((struct boot_params *)$rsi)->hdr.cmd_line_ptr
+0x20000: "console=ttyS0 nokaslr"
+(gdb) ptype struct boot_params
+type = struct boot_params {
+ struct screen_info screen_info;
+ struct apm_bios_info apm_bios_info;
+ __u8 _pad2[4];
+ __u64 tboot_addr;
+ struct ist_info ist_info;
+ __u64 acpi_rsdp_addr;
+ __u8 _pad3[8];
+ __u8 hd0_info[16];
+ __u8 hd1_info[16];
+ struct sys_desc_table sys_desc_table;
+ struct olpc_ofw_header olpc_ofw_header;
+ __u32 ext_ramdisk_image;
+ __u32 ext_ramdisk_size;
+ __u32 ext_cmd_line_ptr;
+ __u8 _pad4[112];
+ __u32 cc_blob_address;
+ struct edid_info edid_info;
+ struct efi_info efi_info;
+ __u32 alt_mem_k;
+ __u32 scratch;
+ __u8 e820_entries;
+ __u8 eddbuf_entries;
+ __u8 edd_mbr_sig_buf_entries;
+ __u8 kbd_status;
+ __u8 secure_boot;
+ __u8 _pad5[2];
+ __u8 sentinel;
+ __u8 _pad6[1];
+ struct setup_header hdr;
+ __u8 _pad7[36];
+ __u32 edd_mbr_sig_buffer[16];
+ struct boot_e820_entry e820_table[128];
+ __u8 _pad8[48];
+ struct edd_info eddbuf[6];
+ __u8 _pad9[276];
+}
+(gdb) x/s ((struct boot_params *)$rsi)->hdr.cmd_line_ptr
+0x20000: "console=ttyS0 nokaslr"
+```
+
+We got it 🎉
-So, the physical address where the kernel is loaded would be `0:0x100000` or just `0x100000`, as specified by the boot protocol. Now let's start with the `32-bit` entry point.
+Now we know where we are, so let's take a look at the code and proceed with learning of the Linux kernel.
-The 32-bit entry point
---------------------------------------------------------------------------------
+## First steps in the protected mode
-The `32-bit` entry point is defined in the [arch/x86/boot/compressed/head_64.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/head_64.S) assembly source code file:
+The `32-bit` entry point is defined in the [arch/x86/boot/compressed/head_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/head_64.S) assembly source code file:
+
```assembly
- __HEAD
.code32
-ENTRY(startup_32)
-....
-....
-....
-ENDPROC(startup_32)
+SYM_FUNC_START(startup_32)
```
-First, why is the directory named `compressed`? The answer to that is that `bzimage` is a gzipped package consisting of `vmlinux`, `header` and ` kernel setup code`. We looked at kernel setup code in all of the previous parts. The main goal of the code in `head_64.S` is to prepare to enter long mode, enter it and then decompress the kernel. We will look at all of the steps leading to kernel decompression in this part.
+First of all, it is worth knowing why the directory is named `compressed`. The answer is that the kernel is in the [`bzImage`](https://en.wikipedia.org/wiki/Vmlinux#bzImage) file. This file is a compressed package consisting of the kernel image and the kernel setup code. In all previous chapters, we were researching the kernel setup code. The two big steps which remain before we see the entrypoint of the kernel are:
+
+- switch to long mode
+- decompress the kernel image and jump to its entrypoint
-You will find two files in the `arch/x86/boot/compressed` directory:
+In this part, we will focus on the first big step; the steps leading to the kernel decompression and the decompression itself will be covered in the next chapters. Returning to the current kernel code, you may find the following two files in the [arch/x86/boot/compressed](https://github.com/torvalds/linux/tree/master/arch/x86/boot/compressed) directory:
-* [head_32.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/head_32.S)
-* [head_64.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/head_64.S)
+- [head_32.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/head_32.S)
+- [head_64.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/head_64.S)
-but we will consider only the `head_64.S` source code file because, as you may remember, this book is only `x86_64` related; Let's look at [arch/x86/boot/compressed/Makefile](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/Makefile). We can find the following `make` target here:
+In our case, we will consider only the `head_64.S` file. Yes, the file is named with the `64` suffix even though the kernel is in 32-bit protected mode at this moment. The explanation for this is simple. Let's look at [arch/x86/boot/compressed/Makefile](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/Makefile). We can see the following `make` goal there:
```Makefile
-vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
- $(obj)/string.o $(obj)/cmdline.o \
+vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/kernel_info.o $(obj)/head_$(BITS).o \
+ $(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/error.o \
$(obj)/piggy.o $(obj)/cpuflags.o
```
-The first line contains this- `$(obj)/head_$(BITS).o`.
-
-This means that we will select which file to link based on what `$(BITS)` is set to, either `head_32.o` or `head_64.o`. The `$(BITS)` variable is defined elsewhere in [arch/x86/Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/Makefile) based on the kernel configuration:
+The first line contains the following target - `$(obj)/head_$(BITS).o`. This means that `make` will select the file during the kernel build process based on the value of `$(BITS)`. This `make` variable is defined in the [arch/x86/Makefile](https://github.com/torvalds/linux/blob/master/arch/x86/Makefile) makefile and its value depends on the kernel configuration:
```Makefile
ifeq ($(CONFIG_X86_32),y)
@@ -91,259 +156,187 @@ else
endif
```
-Now that we know where to start, let's get to it.
+Since we are considering the kernel for the `x86_64` architecture, we assume that `CONFIG_X86_64` is set to `y`. As a result, the `head_64.S` file will be used during the kernel build process. Let's start to investigate what the kernel does in this file.
+
+### Reload the segments if needed
-Reload the segments if needed
---------------------------------------------------------------------------------
+As we already know, our start is in the [arch/x86/boot/compressed/head_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/head_64.S) assembly source code file. The entry point is defined by the `startup_32` symbol.
-As indicated above, we start in the [arch/x86/boot/compressed/head_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/compressed/head_64.S) assembly source code file. We first see the definition of a special section attribute before the definition of the `startup_32` function:
+In the beginning of the `startup_32`, we can see the `cld` instruction which clears the `DF` or [direction flag](https://en.wikipedia.org/wiki/Direction_flag) bit in the [flags](https://en.wikipedia.org/wiki/FLAGS_register) register:
+
```assembly
- __HEAD
- .code32
-ENTRY(startup_32)
+ .code32
+SYM_FUNC_START(startup_32)
+ /*
+ * 32bit entry is 0 and it is ABI so immutable!
+ * If we come here directly from a bootloader,
+ * kernel(text+data+bss+brk) ramdisk, zero_page, command line
+ * all need to be under the 4G limit.
+ */
+ cld
```
-`__HEAD` is a macro defined in the [include/linux/init.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/init.h) header file and expands to the definition of the following section:
+When the direction flag is clear, all string operations which are usually used for copying data, for example [stos](http://x86.renejeschke.de/html/file_module_x86_id_306.html), [scas](http://x86.renejeschke.de/html/file_module_x86_id_287.html) and others, will increment the index registers `esi` or `edi`. We need to clear the direction flag because later we will use string operations to perform various operations such as clearing space for page tables or copying data.
-```C
-#define __HEAD .section ".head.text","ax"
-```
+The next instruction, `cli`, disables interrupts. We have already seen it in the previous chapter. The interrupts are disabled "twice" because modern bootloaders can load the kernel starting directly from this point, and not only through the path that we have seen in the [first chapter](./linux-bootstrap-1.md).
-Here, `.head.text` is the name of the section and `ax` is a set of flags. In our case, these flags show us that this section is [executable](https://en.wikipedia.org/wiki/Executable) or in other words contains code. We can find the definition of this section in the [arch/x86/boot/compressed/vmlinux.lds.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/compressed/vmlinux.lds.S) linker script:
+After these two simple instructions, the next step is to calculate the difference between where the kernel is compiled to run and where it was actually loaded. If we take a look at the linker [script](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/vmlinux.lds.S), we will see the following definition:
-```
+
+```linker-script
SECTIONS
{
+ /* Be careful parts of head_64.S assume startup_32 is at
+ * address 0.
+ */
. = 0;
- .head.text : {
- _head = . ;
- HEAD_TEXT
- _ehead = . ;
- }
- ...
- ...
- ...
-}
```
-If you are not familiar with the syntax of the `GNU LD` linker scripting language, you can find more information in its [documentation](https://sourceware.org/binutils/docs/ld/Scripts.html#Scripts). In short, the `.` symbol is a special linker variable, the location counter. The value assigned to it is an offset relative to the segment. In our case, we set the location counter to zero. This means that our code is linked to run from an offset of `0` in memory. This is also stated in the comments:
+This means that the code in this section is compiled to run at the address zero. We also can see this in the output of `objdump` utility:
-```
-Be careful parts of head_64.S assume startup_32 is at address 0.
-```
+```bash
+$ objdump -D /home/alex/disk/dev/linux/arch/x86/boot/compressed/vmlinux | less
-Now that we have our bearings, let's look at the contents of the `startup_32` function.
+/home/alex/disk/dev/linux/arch/x86/boot/compressed/vmlinux: file format elf64-x86-64
-In the beginning of the `startup_32` function, we can see the `cld` instruction which clears the `DF` bit in the [flags](https://en.wikipedia.org/wiki/FLAGS_register) register. When the direction flag is clear, all string operations like [stos](http://x86.renejeschke.de/html/file_module_x86_id_306.html), [scas](http://x86.renejeschke.de/html/file_module_x86_id_287.html) and others will increment the index registers `esi` or `edi`. We need to clear the direction flag because later we will use strings operations to perform various operations such as clearing space for page tables.
-After we have cleared the `DF` bit, the next step is to check the `KEEP_SEGMENTS` flag in the `loadflags` kernel setup header field. If you remember, we already talked about `loadflags` in the very first [part](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-1) of this book. There we checked the `CAN_USE_HEAP` flag to query the ability to use the heap. Now we need to check the `KEEP_SEGMENTS` flag. This flag is described in the linux [boot protocol](https://www.kernel.org/doc/Documentation/x86/boot.txt) documentation:
+Disassembly of section .head.text:
-```
-Bit 6 (write): KEEP_SEGMENTS
- Protocol: 2.07+
- - If 0, reload the segment registers in the 32bit entry point.
- - If 1, do not reload the segment registers in the 32bit entry point.
- Assume that %cs %ds %ss %es are all set to flat segments with
- a base of 0 (or the equivalent for their environment).
+0000000000000000 :
+ 0: fc cld
+ 1: fa cli
```
-So, if the `KEEP_SEGMENTS` bit is not set in `loadflags`, we need to set the `ds`, `ss` and `es` segment registers to the index of the data segment with a base of `0`. That we do:
+We can see that both the linker script and the `objdump` utility tell us that the address of the `startup_32` function is `0`, but this is not where the kernel was loaded. This is only the address that the code was compiled for, also called the link-time address. Why was it done like that? The answer is simplicity. By telling the linker to set the address of the very first symbol to zero, each subsequent symbol becomes a simple offset from 0. As we already know, the kernel was loaded at the `0x100000` address. The difference between this address and zero is called the relocation delta. Once that delta is known, the code can reach any variable or function by adding this delta to its compile-time address.
-```C
- testb $KEEP_SEGMENTS, BP_loadflags(%esi)
- jnz 1f
+We know these addresses and, as a result, the value of the delta from the experiment we have seen above. Now let's take a look at how the kernel calculates this difference:
- cli
- movl $(__BOOT_DS), %eax
- movl %eax, %ds
- movl %eax, %es
- movl %eax, %ss
+
+```assembly
+ leal (BP_scratch+4)(%esi), %esp
+ call 1f
+1: popl %ebp
+ subl $ rva(1b), %ebp
```
-Remember that `__BOOT_DS` is `0x18` (the index of the data segment in the [Global Descriptor Table](https://en.wikipedia.org/wiki/Global_Descriptor_Table)). If `KEEP_SEGMENTS` is set, we jump to the nearest `1f` label or update segment registers with `__BOOT_DS` if they are not set. This is all pretty easy, but here's something to consider. If you've read the previous [part](https://github.com/0xAX/linux-insides/blob/v4.16/Booting/linux-bootstrap-3.md), you may remember that we already updated these segment registers right after we switched to [protected mode](https://en.wikipedia.org/wiki/Protected_mode) in [arch/x86/boot/pmjump.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/pmjump.S). So why do we need to care about the values in the segment registers again? The answer is easy. The Linux kernel also has a 32-bit boot protocol and if a bootloader uses *that* to load the Linux kernel, all the code before the `startup_32` function will be missed. In this case, the `startup_32` function would be the first entry point to the Linux kernel right after the bootloader and there are no guarantees that the segment registers will be in a known state.
-
-After we have checked the `KEEP_SEGMENTS` flag and set the segment registers to a correct value, the next step is to calculate the difference between where the kernel is compiled to run, and where we loaded it. Remember that `setup.ld.S` contains the following definition: `. = 0` at the start of the `.head.text` section. This means that the code in this section is compiled to run at the address `0`. We can see this in the output of `objdump`:
-
-```
-arch/x86/boot/compressed/vmlinux: file format elf64-x86-64
+The `call` instruction is used to get the real address of the kernel. This trick works because after the `call` instruction is executed, the stack has the return address on its top. In the code above we set up a temporary mini stack to get the address of the kernel and execute the call to the nearest label `1`. Since the top of the stack contains the return address, we pop it into the `ebp` register. Using the last instruction we subtract the difference between the address of the label `1` and the `startup_32` address from the return address that we got at the previous step.
+
-Disassembly of section .head.text:
+Starting from this moment, the `ebp` register will contain the address of the beginning of the kernel image and using it we can calculate offset to any other symbols or structures in memory. And the first such structure that we will access is the Global Descriptor Table. To switch to long mode, we need to update the previously loaded Global Descriptor Table with `64-bit` segments:
-0000000000000000 :
- 0: fc cld
- 1: f6 86 11 02 00 00 40 testb $0x40,0x211(%rsi)
+
+```assembly
+ leal rva(gdt)(%ebp), %eax
+ movl %eax, 2(%eax)
+ lgdt (%eax)
```
-The `objdump` util tells us that the address of the `startup_32` function is `0` but that isn't so. We now need to know where we actually are. This is pretty simple to do in [long mode](https://en.wikipedia.org/wiki/Long_mode) because it supports `rip` relative addressing, but currently we are in [protected mode](https://en.wikipedia.org/wiki/Protected_mode). We will use a common pattern to find the address of the `startup_32` function. We need to define a label, make a call to it and pop the top of the stack to a register:
+Where the new Global Descriptor Table is:
+
```assembly
-call label
-label: pop %reg
+SYM_DATA_START_LOCAL(gdt)
+ .word gdt_end - gdt - 1
+ .long 0
+ .word 0
+ .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
+ .quad 0x00af9a000000ffff /* __KERNEL_CS */
+ .quad 0x00cf92000000ffff /* __KERNEL_DS */
+ .quad 0x0080890000000000 /* TS descriptor */
+ .quad 0x0000000000000000 /* TS continued */
+SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end)
```
-After this, the register indicated by `%reg` will contain the address of `label`. Let's look at the code which uses this pattern to search for the `startup_32` function in the Linux kernel:
+The new Global Descriptor table contains five descriptors:
-```assembly
- leal (BP_scratch+4)(%esi), %esp
- call 1f
-1: popl %ebp
- subl $1b, %ebp
-```
+- `32-bit` kernel code segment
+- `64-bit` kernel code segment
+- `32-bit` kernel data segment
+- Task state descriptor
+- Second task state descriptor
-As you remember from the previous part, the `esi` register contains the address of the [boot_params](https://github.com/torvalds/linux/blob/v4.16/arch/x86/include/uapi/asm/bootparam.h#L113) structure which was filled before we moved to the protected mode. The `boot_params` structure contains a special field `scratch` with an offset of `0x1e4`. This four byte field is a temporary stack for the `call` instruction. We set `esp` to the address four bytes after the `BP_scratch` field of the `boot_params` structure. We add `4` bytes to the base of the `BP_scratch` field because, as just described, it will be a temporary stack and the stack grows from the top to bottom in the `x86_64` architecture. So our stack pointer will point to the top of the temporary stack. Next, we can see the pattern that I've described above. We make a call to the `1f` label and pop the top of the stack onto `ebp`. This works because `call` stores the return address of the current function on the top of the stack. We now have the address of the `1f` label and can now easily get the address of the `startup_32` function. We just need to subtract the address of the label from the address we got from the stack:
+We already saw the loading of the Global Descriptor Table in the previous [part](./linux-bootstrap-3.md#set-up-global-descriptor-table), and now we're doing almost the same here, but we set descriptors to use `CS.L = 1` and `CS.D = 0` for execution in 64-bit mode.
-```
-startup_32 (0x0) +-----------------------+
- | |
- | |
- | |
- | |
- | |
- | |
- | |
- | |
-1f (0x0 + 1f offset) +-----------------------+ %ebp - real physical address
- | |
- | |
- +-----------------------+
-```
+After the new Global Descriptor Table is loaded, the kernel can setup the new stack:
-The `startup_32` function is linked to run at the address `0x0` and this means that `1f` has the address `0x0 + offset to 1f`, which is approximately `0x21` bytes. The `ebp` register contains the real physical address of the `1f` label. So, if we subtract `1f` from the `ebp` register, we will get the real physical address of the `startup_32` function. The Linux kernel [boot protocol](https://www.kernel.org/doc/Documentation/x86/boot.txt) says the base of the protected mode kernel is `0x100000`. We can verify this with [gdb](https://en.wikipedia.org/wiki/GNU_Debugger). Let's start the debugger and add a breakpoint at the address of `1f`, which is `0x100021`. If this is correct we will see the value `0x100021` in the `ebp` register:
+
+```assembly
+ movl $__BOOT_DS, %eax
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %fs
+ movl %eax, %gs
+ movl %eax, %ss
+ /* Setup a stack and load CS from current GDT */
+ leal rva(boot_stack_end)(%ebp), %esp
```
-$ gdb
-(gdb)$ target remote :1234
-Remote debugging using :1234
-0x0000fff0 in ?? ()
-(gdb)$ br *0x100022
-Breakpoint 1 at 0x100022
-(gdb)$ c
-Continuing.
-Breakpoint 1, 0x00100022 in ?? ()
-(gdb)$ i r
-eax 0x18 0x18
-ecx 0x0 0x0
-edx 0x0 0x0
-ebx 0x0 0x0
-esp 0x144a8 0x144a8
-ebp 0x100021 0x100021
-esi 0x142c0 0x142c0
-edi 0x0 0x0
-eip 0x100022 0x100022
-eflags 0x46 [ PF ZF ]
-cs 0x10 0x10
-ss 0x18 0x18
-ds 0x18 0x18
-es 0x18 0x18
-fs 0x18 0x18
-gs 0x18 0x18
-```
+At the previous step we loaded the new Global Descriptor Table, but all the segment registers may still hold selectors from the old table. If those selectors point to invalid entries in the new Global Descriptor Table, the next memory access can cause a [General Protection Fault](https://en.wikipedia.org/wiki/General_protection_fault). Setting them to `__BOOT_DS`, which is a known-good descriptor, should fix this potential fault and allow us to set a proper stack pointed to by `boot_stack_end`.
-If we execute the next instruction, `subl $1b, %ebp`, we will see:
+The last action after we loaded the new Global Descriptor Table is to reload `cs` descriptor:
-```
-(gdb) nexti
-...
-...
-...
-ebp 0x100000 0x100000
-...
-...
-...
+
+```assembly
+ pushl $__KERNEL32_CS
+ leal rva(1f)(%ebp), %eax
+ pushl %eax
+ lretl
+1:
```
-Ok, we've verified that the address of the `startup_32` function is `0x100000`. After we know the address of the `startup_32` label, we can prepare for the transition to [long mode](https://en.wikipedia.org/wiki/Long_mode). Our next goal is to setup the stack and verify that the CPU supports long mode and [SSE](http://en.wikipedia.org/wiki/Streaming_SIMD_Extensions).
+Since we cannot change the `cs` register using a simple `mov` instruction, we need to apply a trick with the `lretl` instruction. This instruction fetches two values from the top of the stack and puts the first value into the `eip` register and the second value into the `cs` register. From this moment we have proper kernel code selector and instruction pointer values.
-Stack setup and CPU verification
---------------------------------------------------------------------------------
+Just a couple of steps separate us from the transition into the long mode. As it was mentioned in the beginning of this chapter, one of the most crucial is to setup `paging`. But before this task, the kernel needs to do last preparations which we will see in the next sections.
-We can't set up the stack until we know where in memory the `startup_32` label is. If we imagine the stack as an array, the stack pointer register `esp` must point to the end of it. Of course, we can define an array in our code, but we need to know its actual address to configure the stack pointer correctly. Let's look at the code:
+## Last steps before paging setup
-```assembly
- movl $boot_stack_end, %eax
- addl %ebp, %eax
- movl %eax, %esp
-```
+As we mentioned in the previous section, there are a couple of additional steps before we can set up paging and switch to long mode. These steps are:
-The `boot_stack_end` label is also defined in the [arch/x86/boot/compressed/head_64.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/head_64.S) assembly source code file and is located in the [.bss](https://en.wikipedia.org/wiki/.bss) section:
+- Verification of CPU
+- Calculation of the relocation address
+- Enabling `PAE` mode
-```assembly
- .bss
- .balign 4
-boot_heap:
- .fill BOOT_HEAP_SIZE, 1, 0
-boot_stack:
- .fill BOOT_STACK_SIZE, 1, 0
-boot_stack_end:
-```
+In the next sections we will take a look at these steps.
-First of all, we put the address of `boot_stack_end` into the `eax` register, so the `eax` register contains the address of `boot_stack_end` as it was linked, which is `0x0 + boot_stack_end`. To get the real address of `boot_stack_end`, we need to add the real address of the `startup_32` function. We've already found this address and put it into the `ebp` register. In the end, the `eax` register will contain the real address of `boot_stack_end` and we just need to set the stack pointer to it.
+### CPU verification
-After we have set up the stack, the next step is CPU verification. Since we are transitioning to `long mode`, we need to check that the CPU supports `long mode` and `SSE`. We will do this with a call to the `verify_cpu` function:
+Before the kernel can switch to long mode, it needs to check that it runs on a suitable `x86_64` processor. This is done by the next piece of code:
+
```assembly
+ /* Make sure cpu supports long mode. */
call verify_cpu
testl %eax, %eax
- jnz no_longmode
+ jnz .Lno_longmode
```
-This function is defined in the [arch/x86/kernel/verify_cpu.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/kernel/verify_cpu.S) assembly file and just contains a couple of calls to the [cpuid](https://en.wikipedia.org/wiki/CPUID) instruction. This instruction is used to get information about the processor. In our case, it checks for `long mode` and `SSE` support and sets the `eax` register to `0` on success and `1` on failure.
-
-If the value of `eax` is not zero, we jump to the `no_longmode` label which just stops the CPU with the `hlt` instruction while no hardware interrupt can happen:
+The `verify_cpu` function is defined in [arch/x86/kernel/verify_cpu.S](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/verify_cpu.S) and executes the [cpuid](https://en.wikipedia.org/wiki/CPUID) instruction to check the details of the processor on which the kernel is running. In our case, the most crucial check is for `long mode` and [SSE](http://en.wikipedia.org/wiki/Streaming_SIMD_Extensions) support. The function sets the `eax` register to `0` on success and `1` on failure. If long mode is not supported by the current processor, the kernel jumps to the `.Lno_longmode` label which just stops the CPU with the `hlt` instruction:
+
```assembly
-no_longmode:
+ .code32
+SYM_FUNC_START_LOCAL_NOALIGN(.Lno_longmode)
+ /* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */
1:
hlt
jmp 1b
```
-If the value of the `eax` register is zero, everything is ok and we can continue.
-
-Calculate the relocation address
---------------------------------------------------------------------------------
-
-The next step is to calculate the relocation address for decompression if needed. First, we need to know what it means for a kernel to be `relocatable`. We already know that the base address of the 32-bit entry point of the Linux kernel is `0x100000`, but that is a 32-bit entry point. The default base address of the Linux kernel is determined by the value of the `CONFIG_PHYSICAL_START` kernel configuration option. Its default value is `0x1000000` or `16 MB`. The main problem here is that if the Linux kernel crashes, a kernel developer must have a `rescue kernel` for [kdump](https://www.kernel.org/doc/Documentation/kdump/kdump.txt) which is configured to load from a different address. The Linux kernel provides a special configuration option to solve this problem: `CONFIG_RELOCATABLE`. As we can read in the documentation of the Linux kernel:
-
-```
-This builds a kernel image that retains relocation information
-so it can be loaded someplace besides the default 1MB.
-
-Note: If CONFIG_RELOCATABLE=y, then the kernel runs from the address
-it has been loaded at and the compile time physical address
-(CONFIG_PHYSICAL_START) is used as the minimum location.
-```
-
-Now that we know where to start, let's get to it.
+If everything is ok, the kernel proceeds with its work.
-Reload the segments if needed
---------------------------------------------------------------------------------
+### Calculation of the kernel relocation address
-As indicated above, we start in the [arch/x86/boot/compressed/head_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/compressed/head_64.S) assembly source code file. We first see the definition of a special section attribute before the definition of the `startup_32` function:
+The next step is to calculate the address for the kernel decompression. The kernel consists of two parts:
-```assembly
- __HEAD
- .code32
-ENTRY(startup_32)
-```
-
-`__HEAD` is a macro defined in the [include/linux/init.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/init.h) header file and expands to the definition of the following section:
-
-```C
-#define __HEAD .section ".head.text","ax"
-```
-
-Here, `.head.text` is the name of the section and `ax` is a set of flags. In our case, these flags show us that this section is [executable](https://en.wikipedia.org/wiki/Executable). In simple terms, this means that a Linux kernel with this option set can be booted from different addresses. Technically, this is done by compiling the decompressor as [position independent code](https://en.wikipedia.org/wiki/Position-independent_code). If we look at [arch/x86/boot/compressed/Makefile](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/Makefile), we can see that the decompressor is indeed compiled with the `-fPIC` flag:
-
-```Makefile
-KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
-```
+- Relatively small decompressor code
+- Chunk of compressed kernel code
-When we are using position-independent code an address is obtained by adding the address field of the instruction to the value of the program counter. We can load code which uses such addressing from any address. That's why we had to get the real physical address of `startup_32`. Now let's get back to the Linux kernel code. Our current goal is to calculate an address where we can relocate the kernel for decompression. The calculation of this address depends on the `CONFIG_RELOCATABLE` kernel configuration option. Let's look at the code:
+Obviously, the final decompressed kernel code will be bigger than the compressed image. The memory area where the decompressed kernel should be located may overlap with the area where the compressed image is located. In this case, the compressed image could be overwritten during the decompression process. To avoid this, the kernel will copy the compressed part to a safe location before decompression. This is done by the following code:
+
```assembly
#ifdef CONFIG_RELOCATABLE
movl %ebp, %ebx
@@ -353,212 +346,140 @@ When we are using position-independent code an address is obtained by adding the
notl %eax
andl %eax, %ebx
cmpl $LOAD_PHYSICAL_ADDR, %ebx
- jge 1f
+ jae 1f
#endif
movl $LOAD_PHYSICAL_ADDR, %ebx
-```
-
-Remember that the value of the `ebp` register is the physical address of the `startup_32` label. If the `CONFIG_RELOCATABLE` kernel configuration option is enabled during kernel configuration, we put this address in the `ebx` register, align it to a multiple of `2MB` and compare it with the result of the `LOAD_PHYSICAL_ADDR` macro. `LOAD_PHYSICAL_ADDR` is defined in the [arch/x86/include/asm/boot.h](https://github.com/torvalds/linux/blob/v4.16/arch/x86/include/asm/boot.h) header file and it looks like this:
-
-```C
-#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
- + (CONFIG_PHYSICAL_ALIGN - 1)) \
- & ~(CONFIG_PHYSICAL_ALIGN - 1))
-```
-
-As we can see it just expands to the aligned `CONFIG_PHYSICAL_ALIGN` value which represents the physical address where the kernel will be loaded. After comparing `LOAD_PHYSICAL_ADDR` and the value of the `ebx` register, we add the offset from `startup_32` where we will decompress the compressed kernel image. If the `CONFIG_RELOCATABLE` option is not enabled during kernel configuration, we just add `z_extract_offset` to the default address where the kernel is loaded.
-
-After all of these calculations, `ebp` will contain the address where we loaded the kernel and `ebx` will contain the address where the decompressed kernel will be relocated. But that is not the end. The compressed kernel image should be moved to the end of the decompression buffer to simplify calculations regarding where the kernel will be located later. For this:
-
-```assembly
1:
- movl BP_init_size(%esi), %eax
- subl $_end, %eax
- addl %eax, %ebx
-```
-
-we put the value from the `boot_params.BP_init_size` field (or the kernel setup header value from `hdr.init_size`) in the `eax` register. The `BP_init_size` field contains the larger of the compressed and uncompressed [vmlinux](https://en.wikipedia.org/wiki/Vmlinux) sizes. Next we subtract the address of the `_end` symbol from this value and add the result of the subtraction to the `ebx` register which will store the base address for kernel decompression.
-
-Preparation before entering long mode
---------------------------------------------------------------------------------
-
-After we get the address to relocate the compressed kernel image to, we need to do one last step before we can transition to 64-bit mode. First, we need to update the [Global Descriptor Table](https://en.wikipedia.org/wiki/Global_Descriptor_Table) with 64-bit segments because a relocatable kernel is runnable at any address below 512GB:
-
-```assembly
- addl %ebp, gdt+2(%ebp)
- lgdt gdt(%ebp)
-```
-Here we adjust the base address of the Global Descriptor table to the address where we actually loaded the kernel and load the `Global Descriptor Table` with the `lgdt` instruction.
-
-To understand the magic with `gdt` offsets we need to look at the definition of the `Global Descriptor Table`. We can find its definition in the same source code [file](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/head_64.S):
-
-```assembly
- .data
-gdt64:
- .word gdt_end - gdt
- .long 0
- .word 0
- .quad 0
-gdt:
- .word gdt_end - gdt
- .long gdt
- .word 0
- .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
- .quad 0x00af9a000000ffff /* __KERNEL_CS */
- .quad 0x00cf92000000ffff /* __KERNEL_DS */
- .quad 0x0080890000000000 /* TS descriptor */
- .quad 0x0000000000000000 /* TS continued */
-gdt_end:
+ /* Target address to relocate to for decompression */
+ addl BP_init_size(%esi), %ebx
+ subl $ rva(_end), %ebx
```
-We can see that it is located in the `.data` section and contains five descriptors: the first is a `32-bit` descriptor for the kernel code segment, a `64-bit` kernel segment, a kernel data segment and two task descriptors.
+The `ebp` register contains the current address of the beginning of the kernel image. We put this address into the `ebx` register and align it to a `2MB` boundary. If the resulting address is equal to or bigger than `LOAD_PHYSICAL_ADDR`, which is `0x1000000`, we use it as is; otherwise we set it to `0x1000000`. Now that we have the starting address of the area where the compressed kernel image will be moved, we add `BP_init_size` to it, which is the size of the decompressed kernel image. This will allow us to copy the compressed kernel image behind the memory area where the kernel will be decompressed. In the end we just subtract the address of `_end` from the value in `ebx` to get the new base address of the decompressor kernel code.
-We already loaded the `Global Descriptor Table` in the previous [part](https://github.com/0xAX/linux-insides/blob/v4.16/Booting/linux-bootstrap-3.md), and now we're doing almost the same here, but we set descriptors to use `CS.L = 1` and `CS.D = 0` for execution in `64` bit mode. As we can see, the definition of the `gdt` starts with a two byte value: `gdt_end - gdt` which represents the address of the last byte in the `gdt` table or the table limit. The next four bytes contain the base address of the `gdt`.
+### Enabling PAE mode
-After we have loaded the `Global Descriptor Table` with the `lgdt` instruction, we must enable [PAE](http://en.wikipedia.org/wiki/Physical_Address_Extension) by putting the value of the `cr4` register into `eax`, setting the 5th bit and loading it back into `cr4`:
+The next step is to setup so-called `PAE` mode:
+
```assembly
+ /* Enable PAE mode */
movl %cr4, %eax
orl $X86_CR4_PAE, %eax
movl %eax, %cr4
```
-Now we are almost finished with the preparations needed to move into 64-bit mode. The last step is to build page tables, but before that, here is some information about long mode.
-
-Long mode
---------------------------------------------------------------------------------
-
-[Long mode](https://en.wikipedia.org/wiki/Long_mode) is the native mode for [x86_64](https://en.wikipedia.org/wiki/X86-64) processors. First, let's look at some differences between `x86_64` and `x86`.
-
-`64-bit` mode provides the following features:
-
-* 8 new general purpose registers from `r8` to `r15`
-* All general purpose registers are 64-bit now
-* A 64-bit instruction pointer - `RIP`
-* A new operating mode - Long mode;
-* 64-Bit Addresses and Operands;
-* RIP Relative Addressing (we will see an example of this in the coming parts).
-
-Long mode is an extension of the legacy protected mode. It consists of two sub-modes:
+We do it by setting the `X86_CR4_PAE` bit in the `cr4` [control register](https://en.wikipedia.org/wiki/Control_register). This tells the CPU that the page table entries that we will see soon will be enlarged from `32` to `64` bits.
-* 64-bit mode;
-* compatibility mode.
+## Setup paging
-To switch into `64-bit` mode we need to do the following things:
+At this moment we are almost finished with the preparations needed to switch the processor into 64-bit mode. One of the last steps is to build [page tables](https://en.wikipedia.org/wiki/Page_table). But before we take a look at the process of page table setup, let's briefly try to understand what they are.
-* Enable [PAE](https://en.wikipedia.org/wiki/Physical_Address_Extension);
-* Build page tables and load the address of the top level page table into the `cr3` register;
-* Enable `EFER.LME`;
-* Enable paging.
+As we mentioned at the beginning of this chapter, on `x86_64` the processor must have paging enabled to use long mode. Paging lets the processor translate [virtual addresses](https://en.wikipedia.org/wiki/Virtual_address_space), or the addresses used by the code, into [physical addresses](https://en.wikipedia.org/wiki/Physical_address). The translation of virtual addresses into physical ones is done using a special structure - page tables. All the memory is considered as an array of sequential blocks called pages. Each page is described by a special descriptor in the page table called `PTE` or page table entry. The page table entries are stored in special structures called page tables. The page tables form a structure with a predefined hierarchy:
-We already enabled `PAE` by setting the `PAE` bit in the `cr4` control register. Our next goal is to build the structure for [paging](https://en.wikipedia.org/wiki/Paging). We will discuss this in the next paragraph.
+- `PML4` - top level table, each entry points to `PDPT`
+- `PDPT` - 3rd level table, each entry points to `PD`
+- `PD` - 2nd level table, each entry points to `PT`
+- `PT` - 1st level table, each entry points to a 4 kilobyte physical page
-Early page table initialization
---------------------------------------------------------------------------------
+The physical address of the top level table must be stored in the `cr3` register.
-We already know that before we can move into `64-bit` mode, we need to build page tables. Let's look at how the early `4G` boot page tables are built.
+When the processor needs to translate a virtual address into the corresponding physical address, it splits the virtual address into the following parts:
-**NOTE: I will not describe the theory of virtual memory here. If you want to know more about virtual memory, check out the links at the end of this part.**
+
-The Linux kernel uses `4-level` paging, and we generally build 6 page tables:
+Knowing the index of the corresponding entry in each table, the CPU obtains the physical address.
-* One `PML4` or `Page Map Level 4` table with one entry;
-* One `PDP` or `Page Directory Pointer` table with four entries;
-* Four Page Directory tables with a total of `2048` entries.
-
-Let's look at how this is implemented. First, we clear the buffer for the page tables in memory. Every table is `4096` bytes, so we need clear a `24` kilobyte buffer:
+The next goal of the kernel is to build a structure similar to the one described above in order to switch to long mode. Let's take a look at how it is implemented in the kernel. First of all, we need to fill the current page table structure specified by [pgtable](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/head_64.S#L533) with zeros for safety:
+
```assembly
- leal pgtable(%ebx), %edi
+ leal rva(pgtable)(%ebx), %edi
xorl %eax, %eax
movl $(BOOT_INIT_PGT_SIZE/4), %ecx
rep stosl
```
-We put the address of `pgtable` with an offset of `ebx` (remember that `ebx` points to the location in memory where the kernel will be decompressed later) into the `edi` register, clear the `eax` register and set the `ecx` register to `6144`.
-
-The `rep stosl` instruction will write the value of `eax` to the memory location where `edi` points to, increment `edi` by `4`, and decrement `ecx` by `1`. This operation will be repeated while the value of the `ecx` register is greater than zero. That's why we put `6144` or `BOOT_INIT_PGT_SIZE/4` in `ecx`.
-
-`pgtable` is defined at the end of the [arch/x86/boot/compressed/head_64.S](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/head_64.S) assembly file:
+After we cleaned the memory area for the page tables, we can start to fill it. First of all, we need to fill the top-level page entry:
+
```assembly
- .section ".pgtable","a",@nobits
- .balign 4096
-pgtable:
- .fill BOOT_PGT_SIZE, 1, 0
+ leal rva(pgtable + 0)(%ebx), %edi
+ leal 0x1007 (%edi), %eax
+ movl %eax, 0(%edi)
+ addl %edx, 4(%edi)
```
-As we can see, it is located in the `.pgtable` section and its size depends on the `CONFIG_X86_VERBOSE_BOOTUP` kernel configuration option:
+This adds the first entry to the top-level page table. This entry will contain a reference to the first entry of the lower-level table. The offset to it is `0x1000` bytes. The `0x7` are flags of the page table entry:
-```C
-# ifdef CONFIG_X86_VERBOSE_BOOTUP
-# define BOOT_PGT_SIZE (19*4096)
-# else /* !CONFIG_X86_VERBOSE_BOOTUP */
-# define BOOT_PGT_SIZE (17*4096)
-# endif
-# else /* !CONFIG_RANDOMIZE_BASE */
-# define BOOT_PGT_SIZE BOOT_INIT_PGT_SIZE
-# endif
-```
+- Present
+- Read/Write
+- User
-After we have a buffer for the `pgtable` structure, we can start to build the top level page table - `PML4` - with:
+Each page table entry is a `64-bit` structure, no matter if it is a `PML4`, `PDPT`, `PD` or `PT` entry. The format is almost the same among all the levels. The difference is only in the address field which stores the physical address of the next page table in the hierarchy. Besides the address field, a page table entry contains flags like:
-```assembly
- leal pgtable + 0(%ebx), %edi
- leal 0x1007 (%edi), %eax
- movl %eax, 0(%edi)
-```
+- `P` - present bit
+- `RW` - read/write bit
+- `US` - user/supervisor bit
+- `PWT` - Page-level Write-Through bit controlling caching of the page
+- `PCD` - Page Cache Disable bit controlling caching of the page
+- `A` - accessed page bit
+- `D` - dirty page bit
+- `PS` - page size bit
+- `NX` - No-Execute bit
-Here again, we put the address of `pgtable` relative to `ebx` or in other words relative to address of `startup_32` in the `edi` register. Next, we put this address with an offset of `0x1007` into the `eax` register. `0x1007` is the result of adding the size of the `PML4` table which is `4096` or `0x1000` bytes with `7`. The `7` here represents the flags associated with the `PML4` entry. In our case, these flags are `PRESENT+RW+USER`. In the end, we just write the address of the first `PDP` entry to the `PML4` table.
+You can find more information about the structure of page tables and page table entries in the [Intel® 64 and IA-32 Architectures Software Developer Manuals](https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html).
-In the next step we will build four `Page Directory` entries in the `Page Directory Pointer` table with the same `PRESENT+RW+USE` flags:
+In the next step we will build four `Page Directory` entries in the `Page Directory Pointer` table with the same `Present+Read/Write+User` flags:
+
```assembly
- leal pgtable + 0x1000(%ebx), %edi
+ leal rva(pgtable + 0x1000)(%ebx), %edi
leal 0x1007(%edi), %eax
movl $4, %ecx
-1: movl %eax, 0x00(%edi)
+1: movl %eax, 0x00(%edi)
+ addl %edx, 0x04(%edi)
addl $0x00001000, %eax
addl $8, %edi
decl %ecx
jnz 1b
```
-We set `edi` to the base address of the page directory pointer which is at an offset of `4096` or `0x1000` bytes from the `pgtable` table and `eax` to the address of the first page directory pointer entry. We also set `ecx` to `4` to act as a counter in the following loop and write the address of the first page directory pointer table entry to the `edi` register. After this, `edi` will contain the address of the first page directory pointer entry with flags `0x7`. Next we calculate the address of the following page directory pointer entries — each entry is `8` bytes — and write their addresses to `eax`. The last step in building the paging structure is to build the `2048` page table entries with `2-MByte` pages:
+In the code above, we can see the filling of the first four entries of the 3rd level page table. The first entry is located at the offset `0x1000` from the beginning of the page table. The value of the `eax` register is similar to the one used for the 4th level page table entry. Next, we just fill the four entries of this table in a loop until the value of `ecx` becomes zero. As soon as these table entries are filled, it is the turn of the next level page table:
+
```assembly
- leal pgtable + 0x2000(%ebx), %edi
+ leal rva(pgtable + 0x2000)(%ebx), %edi
movl $0x00000183, %eax
movl $2048, %ecx
-1: movl %eax, 0(%edi)
+1: movl %eax, 0(%edi)
+ addl %edx, 4(%edi)
addl $0x00200000, %eax
addl $8, %edi
decl %ecx
jnz 1b
```
-Here we do almost the same things that we did in the previous example, all entries are associated with these flags - `$0x00000183` - `PRESENT + WRITE + MBZ`. In the end, we will have a page table with `2048` `2-MByte` pages, which represents a 4 Gigabyte block of memory:
+Here we fill 4 page directories with 2048 entries in total. The first entry is located at the offset `0x2000` from the beginning of the page table. Each entry maps a 2 megabyte chunk of memory with the `Present/Read/Write/Large Page` flags, and in addition there is the `Global` flag. This additional flag tells the processor to keep the [TLB](https://en.wikipedia.org/wiki/Translation_lookaside_buffer) entry across reloads of the `cr3` register.
-```python
->>> 2048 * 0x00200000
-4294967296
-```
+These were the last page table entries which the kernel fills. There is no need at this moment to fill the 1st level `PT` tables because every entry of the 2nd level page tables was filled with the `Large Page` bit, so each such entry directly maps a 2 megabyte region. During the address translation, the page-walk procedure stops at the `PD` level going through `PML4 → PDPT → PD`, and the lower `21` bits of the virtual address will be used as the offset inside that 2 megabyte page.
-Since we've just finished building our early page table structure which maps `4` gigabytes of memory, we can put the address of the high-level page table - `PML4` - into the `cr3` control register:
+Now we can enable the paging by storing the address of the page table in the `cr3` register:
+
```assembly
- leal pgtable(%ebx), %eax
+ leal rva(pgtable)(%ebx), %eax
movl %eax, %cr3
```
-That's all. We are now prepared to transition to long mode.
+The page tables are ready and paging is enabled starting from this moment. Now the kernel is prepared for the transition into long mode.
-The transition to 64-bit mode
---------------------------------------------------------------------------------
+## The transition into 64-bit mode
-First of all we need to set the `EFER.LME` flag in the [MSR](http://en.wikipedia.org/wiki/Model-specific_register) to `0xC0000080`:
+Only the last steps remain before the Linux kernel can switch the CPU into long mode. The first one is setting the `EFER.LME` flag in the special [model specific register](http://en.wikipedia.org/wiki/Model-specific_register) located at the predefined address `0xC0000080`:
+
```assembly
movl $MSR_EFER, %ecx
rdmsr
@@ -566,64 +487,52 @@ First of all we need to set the `EFER.LME` flag in the [MSR](http://en.wikipedia
wrmsr
```
-Here we put the `MSR_EFER` flag (which is defined in [arch/x86/include/asm/msr-index.h](https://github.com/torvalds/linux/blob/v4.16/arch/x86/include/asm/msr-index.h)) in the `ecx` register and execute the `rdmsr` instruction which reads the [MSR](http://en.wikipedia.org/wiki/Model-specific_register) register. After `rdmsr` executes, the resulting data is stored in `edx:eax` according to the `MSR` register specified in `ecx`. We check the current `EFER_LME` bit, transfer it into the carry flag and update the bit, all with the `btsl` instruction. Then we write data from `edx:eax` back to the `MSR` register with the `wrmsr` instruction.
+This is the `Long Mode Enable` bit, and setting it is mandatory to enable `64-bit` mode.
-In the next step, we push the address of the kernel segment code to the stack (we defined it in the GDT) and put the address of the `startup_64` routine in `eax`.
+In the next step, we can see the preparation for the jump to the long mode entrypoint. To do this jump, the kernel stores the base address of the kernel code segment along with the address of the long mode entrypoint on the stack:
+
```assembly
+ leal rva(startup_64)(%ebp), %eax
pushl $__KERNEL_CS
- leal startup_64(%ebp), %eax
-```
-
-After this we push `eax` to the stack and enable paging by setting the `PG` and `PE` bits in the `cr0` register:
-
-```assembly
pushl %eax
- movl $(X86_CR0_PG | X86_CR0_PE), %eax
- movl %eax, %cr0
```
-We then execute the `lret` instruction:
+Everything is ready. Since our stack contains the base of the kernel code segment and the address of the entrypoint, the kernel executes the last instruction in protected mode:
+
```assembly
-lret
+ lret
```
-Remember that we pushed the address of the `startup_64` function to the stack in the previous step. The CPU extracts `startup_64`'s address from the stack and jumps there.
-
-After all of these steps we're finally in 64-bit mode:
+The CPU extracts the address of `startup_64` from the stack and jumps there:
+
```assembly
.code64
.org 0x200
-ENTRY(startup_64)
-....
-....
-....
+SYM_CODE_START(startup_64)
```
-That's all!
+The Linux kernel is now in 64-bit mode 🎉
-Conclusion
---------------------------------------------------------------------------------
-This is the end of the fourth part of the linux kernel booting process. If you have any questions or suggestions, ping me on twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com) or just create an [issue](https://github.com/0xAX/linux-insides/issues/new).
+## Conclusion
-In the next part, we will learn about many things, including how kernel decompression works.
+This is the end of the third part about Linux kernel insides. If you have questions or suggestions, feel free to ping me on X - [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com), or just create an [issue](https://github.com/0xAX/linux-insides/issues/new).
-**Please note that English is not my first language and I am really sorry for any inconvenience. If you find any mistakes please send a PR to [linux-insides](https://github.com/0xAX/linux-internals).**
+## Links
-Links
---------------------------------------------------------------------------------
+Here is the list of the links that you may find useful while reading this chapter:
-* [Protected mode](http://en.wikipedia.org/wiki/Protected_mode)
-* [Intel® 64 and IA-32 Architectures Software Developer’s Manual 3A](http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html)
-* [GNU linker](http://www.eecs.umich.edu/courses/eecs373/readings/Linker.pdf)
-* [SSE](http://en.wikipedia.org/wiki/Streaming_SIMD_Extensions)
-* [Paging](http://en.wikipedia.org/wiki/Paging)
-* [Model specific register](http://en.wikipedia.org/wiki/Model-specific_register)
-* [.fill instruction](http://www.chemie.fu-berlin.de/chemnet/use/info/gas/gas_7.html)
-* [Previous part](https://github.com/0xAX/linux-insides/blob/v4.16/Booting/linux-bootstrap-3.md)
-* [Paging on osdev.org](http://wiki.osdev.org/Paging)
-* [Paging Systems](https://www.cs.rutgers.edu/~pxk/416/notes/09a-paging.html)
-* [x86 Paging Tutorial](http://www.cirosantilli.com/x86-paging/)
+- [Real mode](https://en.wikipedia.org/wiki/Real_mode)
+- [Protected mode](http://en.wikipedia.org/wiki/Protected_mode)
+- [Long mode](https://en.wikipedia.org/wiki/Long_mode)
+- [Linux kernel x86 boot protocol](https://www.kernel.org/doc/Documentation/x86/boot.txt)
+- [Intel® 64 and IA-32 Architectures Software Developer Manuals](https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html)
+- [Paging](http://en.wikipedia.org/wiki/Paging)
+- [Virtual addresses](https://en.wikipedia.org/wiki/Virtual_address_space)
+- [Physical addresses](https://en.wikipedia.org/wiki/Physical_address)
+- [Model specific registers](http://en.wikipedia.org/wiki/Model-specific_register)
+- [Control registers](https://en.wikipedia.org/wiki/Control_register)
+- [Previous part](https://github.com/0xAX/linux-insides/blob/v4.16/Booting/linux-bootstrap-3.md)
diff --git a/Booting/linux-bootstrap-5.md b/Booting/linux-bootstrap-5.md
index a72b7f5a..94124f36 100644
--- a/Booting/linux-bootstrap-5.md
+++ b/Booting/linux-bootstrap-5.md
@@ -204,7 +204,7 @@ Like before, we push `rsi` onto the stack to preserve the pointer to `boot_param
* `output` - the start address of the decompressed kernel;
* `output_len` - the size of the decompressed kernel;
-All arguments will be passed through registers as per the [System V Application Binary Interface](http://www.x86-64.org/documentation/abi.pdf). We've finished all the preparations and can now decompress the kernel.
+All arguments will be passed through registers as per the [System V Application Binary Interface](https://github.com/hjl-tools/x86-psABI/wiki/x86-64-psABI-1.0.pdf). We've finished all the preparations and can now decompress the kernel.
Kernel decompression
--------------------------------------------------------------------------------
@@ -383,13 +383,13 @@ That's all. Now we are in the kernel!
Conclusion
--------------------------------------------------------------------------------
-This is the end of the fifth part about the linux kernel booting process. We will not see any more posts about the kernel booting process (there may be updates to this and previous posts though), but there will be many posts about other kernel internals.
+This is the end of the fifth part about the Linux kernel booting process. We will not see any more posts about the kernel booting process (there may be updates to this and previous posts though), but there will be many posts about other kernel internals.
-The Next chapter will describe more advanced details about linux kernel booting process, like load address randomization and etc.
+The next chapter will describe more advanced details about the Linux kernel booting process, like load address randomization, etc.
If you have any questions or suggestions write me a comment or ping me in [twitter](https://twitter.com/0xAX).
-**Please note that English is not my first language, And I am really sorry for any inconvenience. If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-internals).**
+**Please note that English is not my first language, and I am really sorry for any inconvenience. If you find any mistakes please send me a PR to [linux-insides](https://github.com/0xAX/linux-insides).**
Links
--------------------------------------------------------------------------------
diff --git a/Booting/linux-bootstrap-6.md b/Booting/linux-bootstrap-6.md
index 6bea71d7..f6f8f3d2 100644
--- a/Booting/linux-bootstrap-6.md
+++ b/Booting/linux-bootstrap-6.md
@@ -46,7 +46,7 @@ This function takes five parameters:
* `input`;
* `input_size`;
* `output`;
- * `output_isze`;
+ * `output_size`;
* `virt_addr`.
Let's try to understand what these parameters are. The first parameter, `input` is just the `input_data` parameter of the `extract_kernel` function from the [arch/x86/boot/compressed/misc.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/misc.c) source code file, cast to `unsigned long`:
@@ -91,7 +91,7 @@ input_data:
input_data_end:
```
-As you can see, it contains four global symbols. The first two, `z_input_len` and `z_output_len` are the sizes of the compressed and uncompressed `vmlinux.bin.gz` archive. The third is our `input_data` parameter which points to the linux kernel image's raw binary (stripped of all debugging symbols, comments and relocation information). The last parameter, `input_data_end`, points to the end of the compressed linux image.
+As you can see, it contains four global symbols. The first two, `z_input_len` and `z_output_len` are the sizes of the compressed and uncompressed `vmlinux.bin.gz` archive. The third is our `input_data` parameter which points to the Linux kernel image's raw binary (stripped of all debugging symbols, comments and relocation information). The last parameter, `input_data_end`, points to the end of the compressed Linux image.
So, the first parameter to the `choose_random_location` function is the pointer to the compressed kernel image that is embedded into the `piggy.o` object file.
@@ -146,7 +146,7 @@ Now, we call another function:
initialize_identity_maps();
```
-The `initialize_identity_maps` function is defined in the [arch/x86/boot/compressed/kaslr_64.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/kaslr_64.c) source code file. This function starts by initialising an instance of the `x86_mapping_info` structure called `mapping_info`:
+The `initialize_identity_maps` function is defined in the [arch/x86/boot/compressed/kaslr_64.c](https://github.com/torvalds/linux/blob/master/arch/x86/boot/compressed/kaslr_64.c) source code file. This function starts by initializing an instance of the `x86_mapping_info` structure called `mapping_info`:
```C
mapping_info.alloc_pgt_page = alloc_pgt_page;
@@ -254,7 +254,7 @@ add_identity_map(mem_avoid[MEM_AVOID_ZO_RANGE].start,
mem_avoid[MEM_AVOID_ZO_RANGE].size);
```
-THe `mem_avoid_init` function first tries to avoid memory regions currently used to decompress the kernel. We fill an entry from the `mem_avoid` array with the start address and the size of the relevant region and call the `add_identity_map` function, which builds the identity mapped pages for this region. The `add_identity_map` function is defined in the [arch/x86/boot/compressed/kaslr_64.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/kaslr_64.c) source code file and looks like this:
+The `mem_avoid_init` function first tries to avoid memory regions currently used to decompress the kernel. We fill an entry from the `mem_avoid` array with the start address and the size of the relevant region and call the `add_identity_map` function, which builds the identity mapped pages for this region. The `add_identity_map` function is defined in the [arch/x86/boot/compressed/kaslr_64.c](https://github.com/torvalds/linux/blob/v4.16/arch/x86/boot/compressed/kaslr_64.c) source code file and looks like this:
```C
void add_identity_map(unsigned long start, unsigned long size)
@@ -395,13 +395,13 @@ That's all.
Conclusion
--------------------------------------------------------------------------------
-This is the end of the sixth and last part concerning the linux kernel's booting process. We will not see any more posts about kernel booting (though there may be updates to this and previous posts). We will now turn to other parts of the linux kernel instead.
+This is the end of the sixth and last part concerning the Linux kernel's booting process. We will not see any more posts about kernel booting (though there may be updates to this and previous posts). We will now turn to other parts of the Linux kernel instead.
The next chapter will be about kernel initialization and we will study the first steps take in the Linux kernel initialization code.
If you have any questions or suggestions write me a comment or ping me in [twitter](https://twitter.com/0xAX).
-**Please note that English is not my first language, And I am really sorry for any inconvenience. If you find any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-internals).**
+**Please note that English is not my first language, and I am really sorry for any inconvenience. If you find any mistakes please send me a PR to [linux-insides](https://github.com/0xAX/linux-insides).**
Links
--------------------------------------------------------------------------------
diff --git a/CODEOWNERS b/CODEOWNERS
new file mode 100644
index 00000000..9b2e7c87
--- /dev/null
+++ b/CODEOWNERS
@@ -0,0 +1,5 @@
+# Owner of the repository
+* @0xAX
+
+# Documentation owners
+*.md @0xAX @klaudiagrz
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 00000000..b160fe62
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,128 @@
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+We as members, contributors, and leaders pledge to make participation in our
+community a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, religion, or sexual identity
+and orientation.
+
+We pledge to act and interact in ways that contribute to an open, welcoming,
+diverse, inclusive, and healthy community.
+
+## Our Standards
+
+Examples of behavior that contributes to a positive environment for our
+community include:
+
+* Demonstrating empathy and kindness toward other people
+* Being respectful of differing opinions, viewpoints, and experiences
+* Giving and gracefully accepting constructive feedback
+* Accepting responsibility and apologizing to those affected by our mistakes,
+ and learning from the experience
+* Focusing on what is best not just for us as individuals, but for the
+ overall community
+
+Examples of unacceptable behavior include:
+
+* The use of sexualized language or imagery, and sexual attention or
+ advances of any kind
+* Trolling, insulting or derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or email
+ address, without their explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+ professional setting
+
+## Enforcement Responsibilities
+
+Community leaders are responsible for clarifying and enforcing our standards of
+acceptable behavior and will take appropriate and fair corrective action in
+response to any behavior that they deem inappropriate, threatening, offensive,
+or harmful.
+
+Community leaders have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that are
+not aligned to this Code of Conduct, and will communicate reasons for moderation
+decisions when appropriate.
+
+## Scope
+
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+Examples of representing our community include using an official e-mail address,
+posting via an official social media account, or acting as an appointed
+representative at an online or offline event.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the community leaders responsible for enforcement at
+kuleshovmail@gmail.com.
+All complaints will be reviewed and investigated promptly and fairly.
+
+All community leaders are obligated to respect the privacy and security of the
+reporter of any incident.
+
+## Enforcement Guidelines
+
+Community leaders will follow these Community Impact Guidelines in determining
+the consequences for any action they deem in violation of this Code of Conduct:
+
+### 1. Correction
+
+**Community Impact**: Use of inappropriate language or other behavior deemed
+unprofessional or unwelcome in the community.
+
+**Consequence**: A private, written warning from community leaders, providing
+clarity around the nature of the violation and an explanation of why the
+behavior was inappropriate. A public apology may be requested.
+
+### 2. Warning
+
+**Community Impact**: A violation through a single incident or series
+of actions.
+
+**Consequence**: A warning with consequences for continued behavior. No
+interaction with the people involved, including unsolicited interaction with
+those enforcing the Code of Conduct, for a specified period of time. This
+includes avoiding interactions in community spaces as well as external channels
+like social media. Violating these terms may lead to a temporary or
+permanent ban.
+
+### 3. Temporary Ban
+
+**Community Impact**: A serious violation of community standards, including
+sustained inappropriate behavior.
+
+**Consequence**: A temporary ban from any sort of interaction or public
+communication with the community for a specified period of time. No public or
+private interaction with the people involved, including unsolicited interaction
+with those enforcing the Code of Conduct, is allowed during this period.
+Violating these terms may lead to a permanent ban.
+
+### 4. Permanent Ban
+
+**Community Impact**: Demonstrating a pattern of violation of community
+standards, including sustained inappropriate behavior, harassment of an
+individual, or aggression toward or disparagement of classes of individuals.
+
+**Consequence**: A permanent ban from any sort of public interaction within
+the community.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 2.0, available at
+https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
+
+Community Impact Guidelines were inspired by [Mozilla's code of conduct
+enforcement ladder](https://github.com/mozilla/diversity).
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see the FAQ at
+https://www.contributor-covenant.org/faq. Translations are available at
+https://www.contributor-covenant.org/translations.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 9caf3295..8dc4752d 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,33 +1,45 @@
-Contributing
-================================================================================
+# Contributing
-If you want to contribute to [linux-insides](https://github.com/0xAX/linux-insides), please follow these simple rules:
+This document outlines the contribution workflow, starting from opening an issue, creating a pull request (PR), reviewing, and merging the PR. When working on this project, make sure to follow the [Code of Conduct](./CODE_OF_CONDUCT.md).
-1. Press the fork button:
+Thank you for your contribution.
- 
+## New contributor guide
-2. Clone the repository from your account with:
+If you are a new open source contributor, here are some resources you may find useful before providing your first contributions:
- ```
- git clone git@github.com:your_github_username/linux-insides.git
- ```
+- [Finding ways to contribute to open source on GitHub](https://docs.github.com/en/get-started/exploring-projects-on-github/finding-ways-to-contribute-to-open-source-on-github)
+- [Set up Git](https://docs.github.com/en/get-started/getting-started-with-git/set-up-git)
+- [GitHub flow](https://docs.github.com/en/get-started/using-github/github-flow)
+- [Collaborating with pull requests](https://docs.github.com/en/github/collaborating-with-pull-requests)
-3. Create a new branch with:
+**Working on your first pull request?** You can learn how from this free series [How to Contribute to an Open Source Project on GitHub](https://kcd.im/pull-request).
- ```
- git checkout -b "linux-bootstrap-1-fix"
- ```
- You can name it however you want.
+## Create an issue
-4. Make your changes.
+If you have any improvement ideas, notice a missing feature or a bug, create a GitHub issue by clicking **Issues -> New issue** in GitHub. Make sure to fill the issue template with a detailed description of the bug or suggested improvements. Provide proper argumentation and screenshots, if necessary.
-5. Don't forget to add yourself in `contributors.md`.
+If you find any existing issue to work on, you are welcome to open a PR with a fix.
-6. Commit and push your changes, then make a pull request from Github.
+## Open a pull request
-**IMPORTANT**
+If you want to directly contribute to the project, create a pull request with the suggested changes. To do so:
-Please, don't forget to update your fork. While you made your changes, the content of the `master` branch can change because other pull requests were merged and it can create conflicts. This is why you have to rebase on `master` every time before pushing your changes and check that your branch doesn't have any conflicts with `master`.
+1. [Fork the repository](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo#fork-an-example-repository).
-Thank you.
+2. Make changes on your local copy of the forked repository.
+
+3. Commit and push the changes to GitHub.
+
+> [!IMPORTANT]
+> Don't forget to update your fork. Since many contributors may be working on the same content based on the `master` branch, some merge conflicts may occur. Remember to rebase with `master` every time before pushing your changes and make sure your branch doesn't have any conflicts with `master`. If you run into any merge conflicts, read the [Resolve merge conflicts](https://github.com/skills/resolve-merge-conflicts) tutorial to learn how to resolve merge conflicts and other issues.
+
+4. Open a pull request in GitHub. Fill the pull request template with the reason and description for the provided changes. Link your pull request with the existing issue, if applicable. After submitting your PR, wait for the review from the project maintainers.
+
+## Review and approval process
+
+After you submit your PR, wait for the review. The project maintainers will evaluate your changes and provide feedback either using [suggested changes](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/incorporating-feedback-in-your-pull-request) or pull request comments. Address the review suggestions and comments as soon as you can. If your PR looks good, the maintainers approve and merge it.
+
+## Contributors
+
+All contributions get credit in [Contributors](contributors.md). Don't forget to add yourself there.
diff --git a/Cgroups/images/menuconfig.png b/Cgroups/images/menuconfig.png
index 8e2324a1..13f1d5d5 100644
Binary files a/Cgroups/images/menuconfig.png and b/Cgroups/images/menuconfig.png differ
diff --git a/Cgroups/linux-cgroups-1.md b/Cgroups/linux-cgroups-1.md
index 2aa1167b..79319ad0 100644
--- a/Cgroups/linux-cgroups-1.md
+++ b/Cgroups/linux-cgroups-1.md
@@ -30,7 +30,7 @@ Each of these control group subsystems depends on related configuration option.
You may see enabled control groups on your computer via [proc](https://en.wikipedia.org/wiki/Procfs) filesystem:
```
-$ cat /proc/cgroups
+$ cat /proc/cgroups
#subsys_name hierarchy num_cgroups enabled
cpuset 8 1 1
cpu 7 66 1
@@ -90,7 +90,7 @@ So, if we will run this script we will see following result:
```
$ sudo chmod +x cgroup_test_script.sh
-~$ ./cgroup_test_script.sh
+~$ ./cgroup_test_script.sh
print line
print line
print line
@@ -147,7 +147,7 @@ crw-rw-rw- 1 root tty 5, 0 Dec 3 22:48 /dev/tty
see the first `c` letter in a permissions list. The second part is `5:0` is major and minor numbers of the device. You can see these numbers in the output of `ls` too. And the last `w` letter forbids tasks to write to the specified device. So let's start the `cgroup_test_script.sh` script:
```
-~$ ./cgroup_test_script.sh
+~$ ./cgroup_test_script.sh
print line
print line
print line
@@ -164,7 +164,7 @@ and add pid of this process to the `devices/tasks` file of our group:
The result of this action will be as expected:
```
-~$ ./cgroup_test_script.sh
+~$ ./cgroup_test_script.sh
print line
print line
print line
@@ -174,7 +174,7 @@ print line
./cgroup_test_script.sh: line 5: /dev/tty: Operation not permitted
```
-Similar situation will be when you will run you [docker](https://en.wikipedia.org/wiki/Docker_\(software\)) containers for example:
+A similar situation occurs when you run your [docker](https://en.wikipedia.org/wiki/Docker_(software)) containers, for example:
```
~$ docker ps
@@ -213,7 +213,7 @@ Control group /:
│ └─6404 /bin/bash
```
-Now we know a little about `control groups` mechanism, how to use it manually and what's purpose of this mechanism. It's time to look inside of the Linux kernel source code and start to dive into implementation of this mechanism.
+Now we know a little about `control groups` mechanism, how to use it manually and what's the purpose of this mechanism. It's time to look inside of the Linux kernel source code and start to dive into implementation of this mechanism.
Early initialization of control groups
--------------------------------------------------------------------------------
@@ -294,7 +294,7 @@ Here we may see call of the `init_cgroup_root` function which will execute initi
struct cgroup_root cgrp_dfl_root;
```
-Its `cgrp` field represented by the `cgroup` structure which represents a `cgroup` as you already may guess and defined in the [include/linux/cgroup-defs.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/cgroup-defs.h) header file. We already know that a process which is represented by the `task_struct` in the Linux kernel. The `task_struct` does not contain direct link to a `cgroup` where this task is attached. But it may be reached via `css_set` field of the `task_struct`. This `css_set` structure holds pointer to the array of subsystem states:
+Its `cgrp` field is represented by the `cgroup` structure, which represents a `cgroup` as you may already guess and is defined in the [include/linux/cgroup-defs.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/cgroup-defs.h) header file. We already know that a process is represented by the `task_struct` in the Linux kernel. The `task_struct` does not contain a direct link to the `cgroup` to which this task is attached. But it may be reached via the `css_set` field of the `task_struct`. This `css_set` structure holds a pointer to the array of subsystem states:
```C
struct css_set {
@@ -324,14 +324,14 @@ struct cgroup_subsys_state {
So, the overall picture of `cgroups` related data structure is following:
-```
+```
+-------------+ +---------------------+ +------------->+---------------------+ +----------------+
| task_struct | | css_set | | | cgroup_subsys_state | | cgroup |
+-------------+ | | | +---------------------+ +----------------+
| | | | | | | | flags |
| | | | | +---------------------+ | cgroup.procs |
| | | | | | cgroup |--------->| id |
-| | | | | +---------------------+ | .... |
+| | | | | +---------------------+ | .... |
|-------------+ |---------------------+----+ +----------------+
| cgroups | ------> | cgroup_subsys_state | array of cgroup_subsys_state
|-------------+ +---------------------+------------------>+---------------------+ +----------------+
diff --git a/Concepts/linux-cpu-2.md b/Concepts/linux-cpu-2.md
index 2ccc951b..59c00568 100644
--- a/Concepts/linux-cpu-2.md
+++ b/Concepts/linux-cpu-2.md
@@ -19,13 +19,13 @@ set_cpu_present(cpu, true);
set_cpu_possible(cpu, true);
```
-Before we will consider implementation of these functions, let's consider all of these masks.
+Before we consider implementation of these functions, let's consider all of these masks.
-The `cpu_possible` is a set of cpu ID's which can be plugged in anytime during the life of that system boot or in other words mask of possible CPUs contains maximum number of CPUs which are possible in the system. It will be equal to value of the `NR_CPUS` which is which is set statically via the `CONFIG_NR_CPUS` kernel configuration option.
+The `cpu_possible` is a set of cpu ID's which can be plugged in anytime during the life of that system boot or in other words mask of possible CPUs contains maximum number of CPUs which are possible in the system. It will be equal to value of the `NR_CPUS` which is set statically via the `CONFIG_NR_CPUS` kernel configuration option.
The `cpu_present` mask represents which CPUs are currently plugged in.
-The `cpu_online` represents a subset of the `cpu_present` and indicates CPUs which are available for scheduling or in other words a bit from this mask tells to kernel is a processor may be utilized by the Linux kernel.
+The `cpu_online` represents a subset of the `cpu_present` and indicates CPUs which are available for scheduling or in other words a bit from this mask tells the kernel if a processor may be utilized by the Linux kernel.
The last mask is `cpu_active`. Bits of this mask tells to Linux kernel is a task may be moved to a certain processor.
@@ -94,9 +94,9 @@ And returns `1` every time. We need it here for only one purpose: at compile tim
cpumask API
--------------------------------------------------------------------------------
-As we can define cpumask with one of the method, Linux kernel provides API for manipulating a cpumask. Let's consider one of the function which presented above. For example `set_cpu_online`. This function takes two parameters:
+As we can define a cpumask with one of the methods, the Linux kernel provides an API for manipulating a cpumask. Let's consider one of the functions presented above, for example `set_cpu_online`. This function takes two parameters:
-* Number of CPU;
+* Index of CPU;
* CPU status;
Implementation of this function looks as:
@@ -113,7 +113,7 @@ void set_cpu_online(unsigned int cpu, bool online)
}
```
-First of all it checks the second `state` parameter and calls `cpumask_set_cpu` or `cpumask_clear_cpu` depends on it. Here we can see casting to the `struct cpumask *` of the second parameter in the `cpumask_set_cpu`. In our case it is `cpu_online_bits` which is a bitmap and defined as:
+First of all it checks the second `online` parameter and calls `cpumask_set_cpu` or `cpumask_clear_cpu` depending on it. Here we can see casting to the `struct cpumask *` of the second parameter in the `cpumask_set_cpu`. In our case it is `cpu_online_bits` which is a bitmap and defined as:
```C
static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly;
@@ -128,7 +128,7 @@ static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
}
```
-The `set_bit` function takes two parameters too, and sets a given bit (first parameter) in the memory (second parameter or `cpu_online_bits` bitmap). We can see here that before `set_bit` will be called, its two parameters will be passed to the
+The `set_bit` function takes two parameters too, and sets a given bit (first parameter) in the memory (second parameter or `cpu_online_bits` bitmap). We can see here that before `set_bit` is called, its two parameters will be passed to the
* cpumask_check;
* cpumask_bits.
diff --git a/Concepts/linux-cpu-4.md b/Concepts/linux-cpu-4.md
index 45dfe62e..b7e7e588 100644
--- a/Concepts/linux-cpu-4.md
+++ b/Concepts/linux-cpu-4.md
@@ -6,12 +6,12 @@ Introduction
The Linux kernel is huge piece of [C](https://en.wikipedia.org/wiki/C_%28programming_language%29) code which consists from many different subsystems. Each subsystem has its own purpose which is independent of other subsystems. But often one subsystem wants to know something from other subsystem(s). There is special mechanism in the Linux kernel which allows to solve this problem partly. The name of this mechanism is - `notification chains` and its main purpose to provide a way for different subsystems to subscribe on asynchronous events from other subsystems. Note that this mechanism is only for communication inside kernel, but there are other mechanisms for communication between kernel and userspace.
-Before we will consider `notification chains` [API](https://en.wikipedia.org/wiki/Application_programming_interface) and implementation of this API, let's look at `Notification chains` mechanism from theoretical side as we did it in other parts of this book. Everything which is related to `notification chains` mechanism is located in the [include/linux/notifier.h](https://github.com/torvalds/linux/blob/master/include/linux/notifier.h) header file and [kernel/notifier.c](https://github.com/torvalds/linux/blob/master/kernel/notifier.c) source code file. So let's open them and start to dive.
+Before we consider `notification chains` [API](https://en.wikipedia.org/wiki/Application_programming_interface) and implementation of this API, let's look at `Notification chains` mechanism from theoretical side as we did it in other parts of this book. Everything which is related to `notification chains` mechanism is located in the [include/linux/notifier.h](https://github.com/torvalds/linux/blob/master/include/linux/notifier.h) header file and [kernel/notifier.c](https://github.com/torvalds/linux/blob/master/kernel/notifier.c) source code file. So let's open them and start to dive.
Notification Chains related data structures
--------------------------------------------------------------------------------
-Let's start to consider `notification chains` mechanism from related data structures. As I wrote above, main data structures should be located in the [include/linux/notifier.h](https://github.com/torvalds/linux/blob/master/include/linux/notifier.h) header file, so the Linux kernel provides generic API which does not depend on certain architecture. In general, the `notification chains` mechanism represents a list (that's why it named `chains`) of [callback](https://en.wikipedia.org/wiki/Callback_%28computer_programming%29) functions which are will be executed when an event will be occurred.
+Let's start to consider `notification chains` mechanism from related data structures. As I wrote above, main data structures should be located in the [include/linux/notifier.h](https://github.com/torvalds/linux/blob/master/include/linux/notifier.h) header file, so the Linux kernel provides generic API which does not depend on certain architecture. In general, the `notification chains` mechanism represents a list (that's why it's named `chains`) of [callback](https://en.wikipedia.org/wiki/Callback_%28computer_programming%29) functions which will be executed when an event occurs.
All of these callback functions are represented as `notifier_fn_t` type in the Linux kernel:
@@ -101,7 +101,7 @@ Now as we know a little about `notification chains` mechanism let's consider imp
Notification Chains
--------------------------------------------------------------------------------
-Usually there are two sides in a publish/subscriber mechanisms. One side who wants to get notifications and other side(s) who generates these notifications. We will consider notification chains mechanism from both sides. We will consider `blocking notification chains` in this part, because of other types of notification chains are similar to it and differs mostly in protection mechanisms.
+Usually there are two sides in a publish/subscribe mechanism: one side that wants to get notifications and the other side(s) that generate these notifications. We will consider the notification chains mechanism from both sides. We will consider `blocking notification chains` in this part, because other types of notification chains are similar to it and differ mostly in protection mechanisms.
Before a notification producer is able to produce notification, first of all it should initialize head of a notification chain. For example let's consider notification chains related to kernel [loadable modules](https://en.wikipedia.org/wiki/Loadable_kernel_module). If we will look in the [kernel/module.c](https://github.com/torvalds/linux/blob/master/kernel/module.c) source code file, we will see following definition:
@@ -120,7 +120,7 @@ which defines head for loadable modules blocking notifier chain. The `BLOCKING_N
So we may see that it takes name of a name of a head of a blocking notifier chain and initializes read/write [semaphore](https://0xax.gitbook.io/linux-insides/summary/syncprim/linux-sync-3) and set head to `NULL`. Besides the `BLOCKING_INIT_NOTIFIER_HEAD` macro, the Linux kernel additionally provides `ATOMIC_INIT_NOTIFIER_HEAD`, `RAW_INIT_NOTIFIER_HEAD` macros and `srcu_init_notifier` function for initialization atomic and other types of notification chains.
-After initialization of a head of a notification chain, a subsystem which wants to receive notification from the given notification chain it should register with certain function which is depends on type of notification. If you will look in the [include/linux/notifier.h](https://github.com/torvalds/linux/blob/master/include/linux/notifier.h) header file, you will see following four function for this:
+After initialization of a head of a notification chain, a subsystem which wants to receive notifications from the given notification chain should register with a certain function which depends on the type of notification. If you look in the [include/linux/notifier.h](https://github.com/torvalds/linux/blob/master/include/linux/notifier.h) header file, you will see the following four functions for this:
```C
extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
@@ -247,7 +247,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
}
```
-Where `nr_to_call` and `nr_calls` are number of notifier functions to be called and number of sent notifications. As you may guess the main goal of the `__blocking_notifer_call_chain` function and other functions for other notification types is to call callback function when an event occurred. Implementation of the `__blocking_notifier_call_chain` is pretty simple, it just calls the `notifier_call_chain` function from the same source code file protected with read/write semaphore:
+Where `nr_to_call` and `nr_calls` are number of notifier functions to be called and number of sent notifications. As you may guess the main goal of the `__blocking_notifier_call_chain` function and other functions for other notification types is to call callback function when an event occurs. Implementation of the `__blocking_notifier_call_chain` is pretty simple, it just calls the `notifier_call_chain` function from the same source code file protected with read/write semaphore:
```C
int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
@@ -266,7 +266,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
}
```
-and returns its result. In this case all job is done by the `notifier_call_chain` function. Main purpose of this function informs registered notifiers about an asynchronous event:
+and returns its result. In this case all job is done by the `notifier_call_chain` function. Main purpose of this function is to inform registered notifiers about an asynchronous event:
```C
static int notifier_call_chain(struct notifier_block **nl,
@@ -298,7 +298,7 @@ definition of the `module_notify_list` in the [kernel/module.c](https://github.c
* MODULE_STATE_COMING
* MODULE_STATE_GOING
-in which maybe interested some subsystems of the Linux kernel. For example tracing of kernel modules states. Instead of direct call of the `atomic_notifier_chain_register`, `blocking_notifier_chain_register` and etc., most notification chains come with a set of wrappers used to register to them. Registatrion on these modules events is going with the help of such wrapper:
+in which some subsystems of the Linux kernel may be interested, for example tracing of kernel module states. Instead of a direct call of `atomic_notifier_chain_register`, `blocking_notifier_chain_register`, etc., most notification chains come with a set of wrappers used to register to them. Registration for these module events is done with the help of such a wrapper:
```C
int register_module_notifier(struct notifier_block *nb)
@@ -348,7 +348,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
}
```
-Thus when one of these system call will be called from userspace, the Linux kernel will send certain notification depends on a system call and the `tracepoint_module_notify` callback function will be called.
+Thus when one of these system calls is called from userspace, the Linux kernel will send a certain notification depending on the system call and the `tracepoint_module_notify` callback function will be called.
That's all.
diff --git a/DataStructures/linux-datastructures-2.md b/DataStructures/linux-datastructures-2.md
index cd6b90fe..58c3e2ad 100644
--- a/DataStructures/linux-datastructures-2.md
+++ b/DataStructures/linux-datastructures-2.md
@@ -4,7 +4,7 @@ Data Structures in the Linux Kernel
Radix tree
--------------------------------------------------------------------------------
-As you already know linux kernel provides many different libraries and functions which implement different data structures and algorithms. In this part we will consider one of these data structures - [Radix tree](http://en.wikipedia.org/wiki/Radix_tree). There are two files which are related to `radix tree` implementation and API in the linux kernel:
+As you already know, the Linux kernel provides many different libraries and functions which implement different data structures and algorithms. In this part we will consider one of these data structures - [Radix tree](http://en.wikipedia.org/wiki/Radix_tree). There are two files which are related to `radix tree` implementation and API in the Linux kernel:
* [include/linux/radix-tree.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/radix-tree.h)
* [lib/radix-tree.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/lib/radix-tree.c)
@@ -43,7 +43,7 @@ Lets talk about what a `radix tree` is. Radix tree is a `compressed trie` where
So in this example, we can see the `trie` with keys, `go` and `cat`. The compressed trie or `radix tree` differs from `trie` in that all intermediates nodes which have only one child are removed.
-Radix tree in linux kernel is the data structure which maps values to integer keys. It is represented by the following structures from the file [include/linux/radix-tree.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/radix-tree.h):
+Radix tree in Linux kernel is the data structure which maps values to integer keys. It is represented by the following structures from the file [include/linux/radix-tree.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/radix-tree.h):
```C
struct radix_tree_root {
@@ -98,7 +98,7 @@ This structure contains information about the offset in a parent and height from
* `rcu_head` - used for freeing a node;
* `private_list` - used by the user of a tree;
-The two last fields of the `radix_tree_node` - `tags` and `slots` are important and interesting. Every node can contains a set of slots which are store pointers to the data. Empty slots in the linux kernel radix tree implementation store `NULL`. Radix trees in the linux kernel also supports tags which are associated with the `tags` fields in the `radix_tree_node` structure. Tags allow individual bits to be set on records which are stored in the radix tree.
+The two last fields of the `radix_tree_node` - `tags` and `slots` are important and interesting. Every node can contain a set of slots which store pointers to the data. Empty slots in the Linux kernel radix tree implementation store `NULL`. Radix trees in the Linux kernel also support tags which are associated with the `tags` fields in the `radix_tree_node` structure. Tags allow individual bits to be set on records which are stored in the radix tree.
Now that we know about radix tree structure, it is time to look on its API.
diff --git a/DataStructures/linux-datastructures-3.md b/DataStructures/linux-datastructures-3.md
index c7015f65..ffca3f26 100644
--- a/DataStructures/linux-datastructures-3.md
+++ b/DataStructures/linux-datastructures-3.md
@@ -9,11 +9,11 @@ Besides different [linked](https://en.wikipedia.org/wiki/Linked_data_structure)
* [lib/bitmap.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/lib/bitmap.c)
* [include/linux/bitmap.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/bitmap.h)
-Besides these two files, there is also architecture-specific header file which provides optimized bit operations for certain architecture. We consider [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture, so in our case it will be:
+Besides these two files, there is also architecture-specific header file which provides optimized bit operations for certain architecture. We consider [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture, so in our case it will be:
* [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/bitops.h)
-header file. As I just wrote above, the `bitmap` is heavily used in the Linux kernel. For example a `bit array` is used to store set of online/offline processors for systems which support [hot-plug](https://www.kernel.org/doc/Documentation/cpu-hotplug.txt) cpu (more about this you can read in the [cpumasks](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2) part), a `bit array` stores set of allocated [irqs](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) during initialization of the Linux kernel and etc.
+header file. As I just wrote above, the `bitmap` is heavily used in the Linux kernel. For example a `bit array` is used to store set of online/offline processors for systems which support [hot-plug](https://www.kernel.org/doc/Documentation/cpu-hotplug.txt) CPU (more about this you can read in the [cpumasks](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2) part), a `bit array` stores set of allocated [IRQs](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) during initialization of the Linux kernel and etc.
So, the main goal of this part is to see how `bit arrays` are implemented in the Linux kernel. Let's start.
@@ -75,7 +75,7 @@ I think that there is no need to explain what these function do. This is already
In simple words atomic operations guarantees that two or more operations will not be performed on the same data concurrently. The `x86` architecture provides a set of atomic instructions, for example [xchg](http://x86.renejeschke.de/html/file_module_x86_id_328.html) instruction, [cmpxchg](http://x86.renejeschke.de/html/file_module_x86_id_41.html) instruction and etc. Besides atomic instructions, some of non-atomic instructions can be made atomic with the help of the [lock](http://x86.renejeschke.de/html/file_module_x86_id_159.html) instruction. It is enough to know about atomic operations for now, so we can begin to consider implementation of `set_bit` and `clear_bit` functions.
-First of all, let's start to consider `non-atomic` variants of this function. Names of non-atomic `set_bit` and `clear_bit` starts from double underscore. As we already know, all of these functions are defined in the [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/bitops.h) header file and the first function is `__set_bit`:
+First of all, let's start to consider `non-atomic` variants of these functions. Names of non-atomic `set_bit` and `clear_bit` start with a double underscore. As we already know, all of these functions are defined in the [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/bitops.h) header file and the first function is `__set_bit`:
```C
static inline void __set_bit(long nr, volatile unsigned long *addr)
@@ -134,13 +134,13 @@ which means that this function will be always inlined to reduce size of the Linu
#define IS_IMMEDIATE(nr) (__builtin_constant_p(nr))
```
-The `__builtin_constant_p` builtin function returns `1` if the given parameter is known to be constant at compile-time and returns `0` in other case. We no need to use slow `bts` instruction to set bit if the given number of bit is known in compile time constant. We can just apply [bitwise or](https://en.wikipedia.org/wiki/Bitwise_operation#OR) for byte from the give address which contains given bit and masked number of bits where high bit is `1` and other is zero. In other case if the given number of bit is not known constant at compile-time, we do the same as we did in the `__set_bit` function. The `CONST_MASK_ADDR` macro:
+The `__builtin_constant_p` builtin function returns `1` if the given parameter is known to be constant at compile-time and returns `0` in other case. We do not need to use the slow `bts` instruction to set a bit if the given bit number is a compile-time constant. We can just apply [bitwise or](https://en.wikipedia.org/wiki/Bitwise_operation#OR) for byte from the given address which contains given bit and masked number of bits where high bit is `1` and other is zero. In other case if the given number of bit is not known constant at compile-time, we do the same as we did in the `__set_bit` function. The `CONST_MASK_ADDR` macro:
```C
#define CONST_MASK_ADDR(nr, addr) BITOP_ADDR((void *)(addr) + ((nr)>>3))
```
-expands to the give address with offset to the byte which contains a given bit. For example we have address `0x1000` and the number of bit is `0x9`. So, as `0x9` is `one byte + one bit` our address with be `addr + 1`:
+expands to the given address with offset to the byte which contains a given bit. For example we have address `0x1000` and the number of bit is `0x9`. So, as `0x9` is `one byte + one bit` our address will be `addr + 1`:
```python
>>> hex(0x1000 + (0x9 >> 3))
@@ -204,7 +204,7 @@ and as we can see it is very similar on `set_bit` and just contains two differen
That's all. Now we can set and clear bit in any bit array and and we can go to other operations on bitmasks.
-Most widely used operations on a bit arrays are set and clear bit in a bit array in the Linux kernel. But besides this operations it is useful to do additional operations on a bit array. Yet another widely used operation in the Linux kernel - is to know is a given bit set or not in a bit array. We can achieve this with the help of the `test_bit` macro. This macro is defined in the [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/bitops.h) header file and expands to the call of the `constant_test_bit` or `variable_test_bit` depends on bit number:
+The most widely used operations on bit arrays in the Linux kernel are setting and clearing a bit. But besides these operations it is useful to do additional operations on a bit array. Yet another widely used operation in the Linux kernel is to know if a given bit is set or not in a bit array. We can achieve this with the help of the `test_bit` macro. This macro is defined in the [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/bitops.h) header file and expands to the call of the `constant_test_bit` or `variable_test_bit` depending on the bit number:
```C
#define test_bit(nr, addr) \
@@ -257,7 +257,7 @@ static inline void __change_bit(long nr, volatile unsigned long *addr)
}
```
-Pretty easy, is not it? The implementation of the `__change_bit` is the same as `__set_bit`, but instead of `bts` instruction, we are using [btc](http://x86.renejeschke.de/html/file_module_x86_id_23.html). This instruction selects a given bit from a given bit array, stores its value in the `CF` and changes its value by the applying of complement operation. So, a bit with value `1` will be `0` and vice versa:
+Pretty easy, is it not? The implementation of the `__change_bit` is the same as `__set_bit`, but instead of `bts` instruction, we are using [btc](http://x86.renejeschke.de/html/file_module_x86_id_23.html). This instruction selects a given bit from a given bit array, stores its value in the `CF` and changes its value by the applying of complement operation. So, a bit with value `1` will be `0` and vice versa:
```python
>>> int(not 1)
@@ -290,7 +290,7 @@ For this moment we know the most important architecture-specific operations with
Common bit operations
================================================================================
-Besides the architecture-specific API from the [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/bitops.h) header file, the Linux kernel provides common API for manipulation of bit arrays. As we know from the beginning of this part, we can find it in the [include/linux/bitmap.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/bitmap.h) header file and additionally in the * [lib/bitmap.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/lib/bitmap.c) source code file. But before these source code files let's look into the [include/linux/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/bitops.h) header file which provides a set of useful macro. Let's look on some of they.
+Besides the architecture-specific API from the [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/bitops.h) header file, the Linux kernel provides common API for manipulation of bit arrays. As we know from the beginning of this part, we can find it in the [include/linux/bitmap.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/bitmap.h) header file and additionally in the [lib/bitmap.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/lib/bitmap.c) source code file. But before these source code files let's look into the [include/linux/bitops.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/bitops.h) header file which provides a set of useful macros. Let's look at some of them.
First of all let's look at following four macros:
@@ -317,7 +317,7 @@ The next [header](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2
* `bitmap_zero`;
* `bitmap_fill`.
-To clear a bit array and fill it with `1`. Let's look on the implementation of the `bitmap_zero` function:
+To clear a bit array or fill it with `1`. Let's look at the implementation of the `bitmap_zero` function:
```C
static inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
@@ -340,7 +340,7 @@ First of all we can see the check for `nbits`. The `small_const_nbits` is macro
As we may see it checks that `nbits` is known constant in compile time and `nbits` value does not overflow `BITS_PER_LONG` or `64`. If bits number does not overflow amount of bits in a `long` value we can just set to zero. In other case we need to calculate how many `long` values do we need to fill our bit array and fill it with [memset](http://man7.org/linux/man-pages/man3/memset.3.html).
-The implementation of the `bitmap_fill` function is similar on implementation of the `biramp_zero` function, except we fill a given bit array with `0xff` values or `0b11111111`:
+The implementation of the `bitmap_fill` function is similar to the implementation of the `bitmap_zero` function, except we fill a given bit array with `0xff` values or `0b11111111`:
```C
static inline void bitmap_fill(unsigned long *dst, unsigned int nbits)
@@ -354,7 +354,7 @@ static inline void bitmap_fill(unsigned long *dst, unsigned int nbits)
}
```
-Besides the `bitmap_fill` and `bitmap_zero` functions, the [include/linux/bitmap.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/bitmap.h) header file provides `bitmap_copy` which is similar on the `bitmap_zero`, but just uses [memcpy](http://man7.org/linux/man-pages/man3/memcpy.3.html) instead of [memset](http://man7.org/linux/man-pages/man3/memset.3.html). Also it provides bitwise operations for bit array like `bitmap_and`, `bitmap_or`, `bitamp_xor` and etc. We will not consider implementation of these functions because it is easy to understand implementations of these functions if you understood all from this part. Anyway if you are interested how did these function implemented, you may open [include/linux/bitmap.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/bitmap.h) header file and start to research.
+Besides the `bitmap_fill` and `bitmap_zero` functions, the [include/linux/bitmap.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/bitmap.h) header file provides `bitmap_copy` which is similar to the `bitmap_zero`, but just uses [memcpy](http://man7.org/linux/man-pages/man3/memcpy.3.html) instead of [memset](http://man7.org/linux/man-pages/man3/memset.3.html). Also it provides bitwise operations for bit array like `bitmap_and`, `bitmap_or`, `bitmap_xor`, etc. We will not consider implementation of these functions because it is easy to understand implementations of these functions if you understood all from this part. Anyway if you are interested in how these functions are implemented, you may open [include/linux/bitmap.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/bitmap.h) header file and start to research.
That's all.
@@ -363,7 +363,7 @@ Links
* [bitmap](https://en.wikipedia.org/wiki/Bit_array)
* [linked data structures](https://en.wikipedia.org/wiki/Linked_data_structure)
-* [tree data structures](https://en.wikipedia.org/wiki/Tree_%28data_structure%29)
+* [tree data structures](https://en.wikipedia.org/wiki/Tree_%28data_structure%29)
* [hot-plug](https://www.kernel.org/doc/Documentation/cpu-hotplug.txt)
* [cpumasks](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2)
* [IRQs](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29)
@@ -377,7 +377,7 @@ Links
* [bt instruction](http://x86.renejeschke.de/html/file_module_x86_id_22.html)
* [sbb instruction](http://x86.renejeschke.de/html/file_module_x86_id_286.html)
* [btc instruction](http://x86.renejeschke.de/html/file_module_x86_id_23.html)
-* [man memcpy](http://man7.org/linux/man-pages/man3/memcpy.3.html)
+* [man memcpy](http://man7.org/linux/man-pages/man3/memcpy.3.html)
* [man memset](http://man7.org/linux/man-pages/man3/memset.3.html)
* [CF](https://en.wikipedia.org/wiki/FLAGS_register)
* [inline assembler](https://en.wikipedia.org/wiki/Inline_assembler)
diff --git a/Dockerfile b/Dockerfile
index 48b772b3..bd0ae9ac 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,3 +1,10 @@
-FROM lrx0014/gitbook:3.2.3
+FROM kyselejsyrecek/gitbook:3.2.3
COPY ./ /srv/gitbook/
-EXPOSE 4000
\ No newline at end of file
+EXPOSE 4000
+WORKDIR /srv/gitbook
+CMD ["sh", "-c", "/usr/local/bin/gitbook serve"]
+
+# Examples:
+#RUN gitbook pdf
+#RUN gitbook epub
+
diff --git a/Initialization/README.md b/Initialization/README.md
index da9dde69..6422b9e7 100644
--- a/Initialization/README.md
+++ b/Initialization/README.md
@@ -13,4 +13,4 @@ You will find here a couple of posts which describe the full cycle of kernel ini
* [The End of the architecture-specific initializations, almost...](linux-initialization-7.md) - describes the end of the `setup_arch` related stuff.
* [Scheduler initialization](linux-initialization-8.md) - describes preparation before scheduler initialization and initialization of it.
* [RCU initialization](linux-initialization-9.md) - describes the initialization of the [RCU](http://en.wikipedia.org/wiki/Read-copy-update).
-* [End of the initialization](linux-initialization-10.md) - the last part about linux kernel initialization.
+* [End of the initialization](linux-initialization-10.md) - the last part about Linux kernel initialization.
diff --git a/Initialization/images/CONFIG_NR_CPUS.png b/Initialization/images/CONFIG_NR_CPUS.png
index 996f0293..5e36552b 100644
Binary files a/Initialization/images/CONFIG_NR_CPUS.png and b/Initialization/images/CONFIG_NR_CPUS.png differ
diff --git a/Initialization/images/NX.png b/Initialization/images/NX.png
index 059e26d7..9fdcd460 100644
Binary files a/Initialization/images/NX.png and b/Initialization/images/NX.png differ
diff --git a/Initialization/images/brk_area.png b/Initialization/images/brk_area.png
index e79b7d11..4a5ea54a 100644
Binary files a/Initialization/images/brk_area.png and b/Initialization/images/brk_area.png differ
diff --git a/Initialization/images/kernel_command_line.png b/Initialization/images/kernel_command_line.png
index e65f9b21..9ff8c7d0 100644
Binary files a/Initialization/images/kernel_command_line.png and b/Initialization/images/kernel_command_line.png differ
diff --git a/Initialization/linux-initialization-1.md b/Initialization/linux-initialization-1.md
index 598fba16..9ca18e74 100644
--- a/Initialization/linux-initialization-1.md
+++ b/Initialization/linux-initialization-1.md
@@ -674,7 +674,7 @@ The next step will be setup of the early `IDT` handlers, but it's big concept so
Conclusion
--------------------------------------------------------------------------------
-This is the end of the first part about linux kernel initialization.
+This is the end of the first part about Linux kernel initialization.
If you have questions or suggestions, feel free to ping me in twitter [0xAX](https://twitter.com/0xAX), drop me [email](mailto:anotherworldofworld@gmail.com) or just create [issue](https://github.com/0xAX/linux-insides/issues/new).
diff --git a/Initialization/linux-initialization-10.md b/Initialization/linux-initialization-10.md
index 3636737e..016f76cf 100644
--- a/Initialization/linux-initialization-10.md
+++ b/Initialization/linux-initialization-10.md
@@ -1,10 +1,10 @@
Kernel initialization. Part 10.
================================================================================
-End of the linux kernel initialization process
+End of the Linux kernel initialization process
================================================================================
-This is tenth part of the chapter about linux kernel [initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization) and in the [previous part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-9) we saw the initialization of the [RCU](http://en.wikipedia.org/wiki/Read-copy-update) and stopped on the call of the `acpi_early_init` function. This part will be the last part of the [Kernel initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization) chapter, so let's finish it.
+This is the tenth part of the chapter about Linux kernel [initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization) and in the [previous part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-9) we saw the initialization of the [RCU](http://en.wikipedia.org/wiki/Read-copy-update) and stopped on the call of the `acpi_early_init` function. This part will be the last part of the [Kernel initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization) chapter, so let's finish it.
After the call of the `acpi_early_init` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c), we can see the following code:
@@ -132,12 +132,12 @@ structure from the [include/uapi/linux/resource.h](https://github.com/torvalds/l
```C
cat /proc/self/limits
-Limit Soft Limit Hard Limit Units
+Limit Soft Limit Hard Limit Units
...
...
...
-Max processes 63815 63815 processes
-Max pending signals 63815 63815 signals
+Max processes 63815 63815 processes
+Max pending signals 63815 63815 signals
...
...
...
@@ -149,7 +149,7 @@ Initialization of the caches
The next function after the `fork_init` is the `proc_caches_init` from the [kernel/fork.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/fork.c). This function allocates caches for the memory descriptors (or `mm_struct` structure). At the beginning of the `proc_caches_init` we can see allocation of the different [SLAB](http://en.wikipedia.org/wiki/Slab_allocation) caches with the call of the `kmem_cache_create`:
* `sighand_cachep` - manage information about installed signal handlers;
-* `signal_cachep` - manage information about process signal descriptor;
+* `signal_cachep` - manage information about process signal descriptor;
* `files_cachep` - manage information about opened files;
* `fs_cachep` - manage filesystem information.
@@ -185,7 +185,7 @@ nrpages = (nr_free_buffer_pages() * 10) / 100;
max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
```
-which will be equal to the `10%` of the `ZONE_NORMAL` (all RAM from the 4GB on the `x86_64`). The next function after the `buffer_init` is - `vfs_caches_init`. This function allocates `SLAB` caches and hashtable for different [VFS](http://en.wikipedia.org/wiki/Virtual_file_system) caches. We already saw the `vfs_caches_init_early` function in the eighth part of the linux kernel [initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-8) which initialized caches for `dcache` (or directory-cache) and [inode](http://en.wikipedia.org/wiki/Inode) cache. The `vfs_caches_init` function makes post-early initialization of the `dcache` and `inode` caches, private data cache, hash tables for the mount points, etc. More details about [VFS](http://en.wikipedia.org/wiki/Virtual_file_system) will be described in the separate part. After this we can see `signals_init` function. This function is defined in the [kernel/signal.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/signal.c) and allocates a cache for the `sigqueue` structures which represents queue of the real time signals. The next function is `page_writeback_init`. This function initializes the ratio for the dirty pages. Every low-level page entry contains the `dirty` bit which indicates whether a page has been written to after been loaded into memory.
+which will be equal to the `10%` of the `ZONE_NORMAL` (all RAM from the 4GB on the `x86_64`). The next function after the `buffer_init` is - `vfs_caches_init`. This function allocates `SLAB` caches and hashtable for different [VFS](http://en.wikipedia.org/wiki/Virtual_file_system) caches. We already saw the `vfs_caches_init_early` function in the eighth part of the Linux kernel [initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-8) which initialized caches for `dcache` (or directory-cache) and [inode](http://en.wikipedia.org/wiki/Inode) cache. The `vfs_caches_init` function makes post-early initialization of the `dcache` and `inode` caches, private data cache, hash tables for the mount points, etc. More details about [VFS](http://en.wikipedia.org/wiki/Virtual_file_system) will be described in the separate part. After this we can see `signals_init` function. This function is defined in the [kernel/signal.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/signal.c) and allocates a cache for the `sigqueue` structures which represents queue of the real time signals. The next function is `page_writeback_init`. This function initializes the ratio for the dirty pages. Every low-level page entry contains the `dirty` bit which indicates whether a page has been written to after being loaded into memory.
Creation of the root for the procfs
--------------------------------------------------------------------------------
@@ -230,9 +230,9 @@ and a couple of directories depends on the different configuration options:
In the end of the `proc_root_init` we call the `proc_sys_init` function which creates `/proc/sys` directory and initializes the [Sysctl](http://en.wikipedia.org/wiki/Sysctl).
-It is the end of `start_kernel` function. I did not describe all functions which are called in the `start_kernel`. I skipped them, because they are not important for the generic kernel initialization stuff and depend on only different kernel configurations. They are `taskstats_init_early` which exports per-task statistic to the user-space, `delayacct_init` - initializes per-task delay accounting, `key_init` and `security_init` initialize different security stuff, `check_bugs` - fix some architecture-dependent bugs, `ftrace_init` function executes initialization of the [ftrace](https://www.kernel.org/doc/Documentation/trace/ftrace.txt), `cgroup_init` makes initialization of the rest of the [cgroup](http://en.wikipedia.org/wiki/Cgroups) subsystem,etc. Many of these parts and subsystems will be described in the other chapters.
+It is the end of `start_kernel` function. I did not describe all functions which are called in the `start_kernel`. I skipped them, because they are not important for the generic kernel initialization stuff and depend only on different kernel configurations. They are `taskstats_init_early` which exports per-task statistics to the user-space, `delayacct_init` - initializes per-task delay accounting, `key_init` and `security_init` initialize different security stuff, `check_bugs` - fix some architecture-dependent bugs, `ftrace_init` function executes initialization of the [ftrace](https://www.kernel.org/doc/Documentation/trace/ftrace.txt), `cgroup_init` makes initialization of the rest of the [cgroup](http://en.wikipedia.org/wiki/Cgroups) subsystem, etc. Many of these parts and subsystems will be described in the other chapters.
-That's all. Finally we have passed through the long-long `start_kernel` function. But it is not the end of the linux kernel initialization process. We haven't run the first process yet. In the end of the `start_kernel` we can see the last call of the - `rest_init` function. Let's go ahead.
+That's all. Finally we have passed through the long-long `start_kernel` function. But it is not the end of the Linux kernel initialization process. We haven't run the first process yet. In the end of the `start_kernel` we can see the last call of the - `rest_init` function. Let's go ahead.
First steps after the start_kernel
--------------------------------------------------------------------------------
@@ -251,7 +251,7 @@ kernel_thread(kernel_init, NULL, CLONE_FS);
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
```
-Here the `kernel_thread` function (defined in the [kernel/fork.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/fork.c)) creates new kernel thread.As we can see the `kernel_thread` function takes three arguments:
+Here the `kernel_thread` function (defined in the [kernel/fork.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/fork.c)) creates new kernel thread. As we can see the `kernel_thread` function takes three arguments:
* Function which will be executed in a new thread;
* Parameter for the `kernel_init` function;
@@ -271,7 +271,7 @@ Let's postpone `kernel_init` and `kthreadd` for now and go ahead in the `rest_in
kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
rcu_read_unlock();
```
-
+
The first `rcu_read_lock` function marks the beginning of an [RCU](http://en.wikipedia.org/wiki/Read-copy-update) read-side critical section and the `rcu_read_unlock` marks the end of an RCU read-side critical section. We call these functions because we need to protect the `find_task_by_pid_ns`. The `find_task_by_pid_ns` returns pointer to the `task_struct` by the given pid. So, here we are getting the pointer to the `task_struct` for `PID = 2` (we got it after `kthreadd` creation with the `kernel_thread`). In the next step we call `complete` function
```C
@@ -314,7 +314,7 @@ void init_idle_bootup_task(struct task_struct *idle)
}
```
-where `idle` class is a low task priority and tasks can be run only when the processor doesn't have anything to run besides this tasks. The second function `schedule_preempt_disabled` disables preempt in `idle` tasks. And the third function `cpu_startup_entry` is defined in the [kernel/sched/idle.c](https://github.com/torvalds/linux/blob/master/kernel/sched/idle.c) and calls `cpu_idle_loop` from the [kernel/sched/idle.c](https://github.com/torvalds/linux/blob/master/kernel/sched/idle.c). The `cpu_idle_loop` function works as process with `PID = 0` and works in the background. Main purpose of the `cpu_idle_loop` is to consume the idle CPU cycles. When there is no process to run, this process starts to work. We have one process with `idle` scheduling class (we just set the `current` task to the `idle` with the call of the `init_idle_bootup_task` function), so the `idle` thread does not do useful work but just checks if there is an active task to switch to:
+where `idle` class is a low task priority and tasks can be run only when the processor doesn't have anything to run besides these tasks. The second function `schedule_preempt_disabled` disables preempt in `idle` tasks. And the third function `cpu_startup_entry` is defined in the [kernel/sched/idle.c](https://github.com/torvalds/linux/blob/master/kernel/sched/idle.c) and calls `cpu_idle_loop` from the [kernel/sched/idle.c](https://github.com/torvalds/linux/blob/master/kernel/sched/idle.c). The `cpu_idle_loop` function works as process with `PID = 0` and works in the background. Main purpose of the `cpu_idle_loop` is to consume the idle CPU cycles. When there is no process to run, this process starts to work. We have one process with `idle` scheduling class (we just set the `current` task to the `idle` with the call of the `init_idle_bootup_task` function), so the `idle` thread does not do useful work but just checks if there is an active task to switch to:
```C
static void cpu_idle_loop(void)
@@ -418,7 +418,7 @@ The `do_execve` function is defined in the [include/linux/sched.h](https://githu
}
```
-If we did not pass `init=` kernel command line parameter either, kernel tries to run one of the following executable files:
+If we did not pass `init=` kernel command line parameter either, kernel tries to run one of the following executable files:
```C
if (!try_to_run_init_process("/sbin/init") ||
@@ -440,7 +440,7 @@ That's all! Linux kernel initialization process is finished!
Conclusion
--------------------------------------------------------------------------------
-It is the end of the tenth part about the linux kernel [initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization). It is not only the `tenth` part, but also is the last part which describes initialization of the linux kernel. As I wrote in the first [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-1) of this chapter, we will go through all steps of the kernel initialization and we did it. We started at the first architecture-independent function - `start_kernel` and finished with the launch of the first `init` process in the our system. I skipped details about different subsystem of the kernel, for example I almost did not cover scheduler, interrupts, exception handling, etc. From the next part we will start to dive to the different kernel subsystems. Hope it will be interesting.
+It is the end of the tenth part about the Linux kernel [initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization). It is not only the `tenth` part, but also is the last part which describes initialization of the Linux kernel. As I wrote in the first [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-1) of this chapter, we will go through all steps of the kernel initialization and we did it. We started at the first architecture-independent function - `start_kernel` and finished with the launch of the first `init` process in our system. I skipped details about different subsystem of the kernel, for example I almost did not cover scheduler, interrupts, exception handling, etc. From the next part we will start to dive to the different kernel subsystems. Hope it will be interesting.
If you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).
diff --git a/Initialization/linux-initialization-2.md b/Initialization/linux-initialization-2.md
index 2b26205f..3e9bcbea 100644
--- a/Initialization/linux-initialization-2.md
+++ b/Initialization/linux-initialization-2.md
@@ -612,7 +612,7 @@ All what this function does is just returns `1` if the exception is generated be
Conclusion
--------------------------------------------------------------------------------
-This is the end of the second part about linux kernel insides. If you have questions or suggestions, ping me in twitter [0xAX](https://twitter.com/0xAX), drop me [email](mailto:anotherworldofworld@gmail.com) or just create [issue](https://github.com/0xAX/linux-insides/issues/new). In the next part we will see all steps before kernel entry point - `start_kernel` function.
+This is the end of the second part about Linux kernel insides. If you have questions or suggestions, ping me in twitter [0xAX](https://twitter.com/0xAX), drop me [email](mailto:anotherworldofworld@gmail.com) or just create [issue](https://github.com/0xAX/linux-insides/issues/new). In the next part we will see all steps before kernel entry point - `start_kernel` function.
**Please note that English is not my first language and I am really sorry for any inconvenience. If you found any mistakes please send me PR to [linux-insides](https://github.com/0xAX/linux-insides).**
diff --git a/Initialization/linux-initialization-3.md b/Initialization/linux-initialization-3.md
index 0b86270b..ad916852 100644
--- a/Initialization/linux-initialization-3.md
+++ b/Initialization/linux-initialization-3.md
@@ -4,7 +4,7 @@ Kernel initialization. Part 3.
Last preparations before the kernel entry point
--------------------------------------------------------------------------------
-This is the third part of the Linux kernel initialization process series. In the previous [part](https://github.com/0xAX/linux-insides/blob/master/Initialization/linux-initialization-2.md) we saw early interrupt and exception handling and will continue to dive into the linux kernel initialization process in the current part. Our next point is 'kernel entry point' - `start_kernel` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) source code file. Yes, technically it is not kernel's entry point but the start of the generic kernel code which does not depend on certain architecture. But before we call the `start_kernel` function, we must do some preparations. So let's continue.
+This is the third part of the Linux kernel initialization process series. In the previous [part](https://github.com/0xAX/linux-insides/blob/master/Initialization/linux-initialization-2.md) we saw early interrupt and exception handling and will continue to dive into the Linux kernel initialization process in the current part. Our next point is 'kernel entry point' - `start_kernel` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) source code file. Yes, technically it is not kernel's entry point but the start of the generic kernel code which does not depend on certain architecture. But before we call the `start_kernel` function, we must do some preparations. So let's continue.
boot_params again
--------------------------------------------------------------------------------
@@ -29,10 +29,10 @@ Now let's look at `__va` macro. This macro defined in [init/main.c](https://gith
#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
```
-where `PAGE_OFFSET` is `__PAGE_OFFSET` which is `0xffff880000000000` and the base virtual address of the direct mapping of all physical memory. So we're getting virtual address of the `boot_params` structure and pass it to the `copy_bootdata` function, where we copy `real_mod_data` to the `boot_params` which is declared in the [arch/x86/include/asm/setup.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/setup.h)
+where `PAGE_OFFSET` is `__PAGE_OFFSET` which is `0xffff880000000000` and the base virtual address of the direct mapping of all physical memory. So we're getting the virtual address of the variable `boot_params` which comes along from real mode, and pass it to the `copy_bootdata` function, where we copy `real_mode_data` to the `boot_params` which is defined in the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/d9919d43cbf6790d2bc0c0a2743c51fc25f26919/arch/x86/kernel/setup.c)
```C
-extern struct boot_params boot_params;
+struct boot_params boot_params;
```
Let's look at the `copy_boot_data` implementation:
@@ -181,7 +181,7 @@ if (paravirt_enabled())
return;
```
-we exit from the `reserve_ebda_region` function if paravirtualization is enabled because if it enabled the extended bios data area is absent. In the next step we need to get the end of the low memory:
+we exit from the `reserve_ebda_region` function if paravirtualization is enabled because if it is enabled, the extended BIOS data area is absent. In the next step we need to get the end of the low memory:
```C
lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
@@ -205,7 +205,7 @@ static inline unsigned int get_bios_ebda(void)
}
```
-Let's try to understand how it works. Here we can see that we converting physical address `0x40E` to the virtual, where `0x0040:0x000e` is the segment which contains base address of the extended BIOS data area. Don't worry that we are using `phys_to_virt` function for converting a physical address to virtual address. You can note that previously we have used `__va` macro for the same point, but `phys_to_virt` is the same:
+Let's try to understand how it works. Here we can see that we are converting physical address `0x40E` to the virtual, where `0x0040:0x000e` is the segment which contains base address of the extended BIOS data area. Don't worry that we are using `phys_to_virt` function for converting a physical address to virtual address. You can note that previously we have used `__va` macro for the same point, but `phys_to_virt` is the same:
```C
static inline void *phys_to_virt(phys_addr_t address)
@@ -242,7 +242,7 @@ which is:
#define INSANE_CUTOFF 0x20000U
```
-or 128 kilobytes. In the last step we get lower part in the low memory and extended bios data area and call `memblock_reserve` function which will reserve memory region for extended bios data between low memory and one megabyte mark:
+or 128 kilobytes. In the last step we get lower part in the low memory and extended BIOS data area and call `memblock_reserve` function which will reserve memory region for extended BIOS data between low memory and one megabyte mark:
```C
lowmem = min(lowmem, ebda_addr);
@@ -255,12 +255,12 @@ memblock_reserve(lowmem, 0x100000 - lowmem);
* base physical address;
* region size.
-and reserves memory region for the given base address and size. `memblock_reserve` is the first function in this book from linux kernel memory manager framework. We will take a closer look on memory manager soon, but now let's look at its implementation.
+and reserves memory region for the given base address and size. `memblock_reserve` is the first function in this book from Linux kernel memory manager framework. We will take a closer look on memory manager soon, but now let's look at its implementation.
-First touch of the linux kernel memory manager framework
+First touch of the Linux kernel memory manager framework
--------------------------------------------------------------------------------
-In the previous paragraph we stopped at the call of the `memblock_reserve` function and as i said before it is the first function from the memory manager framework. Let's try to understand how it works. `memblock_reserve` function just calls:
+In the previous paragraph we stopped at the call of the `memblock_reserve` function and as I said before it is the first function from the memory manager framework. Let's try to understand how it works. `memblock_reserve` function just calls:
```C
memblock_reserve_region(base, size, MAX_NUMNODES, 0);
@@ -290,7 +290,7 @@ struct memblock_type {
};
```
-As we need to reserve memory block for extended bios data area, the type of the current memory region is reserved where `memblock` structure is:
+As we need to reserve memory block for extended BIOS data area, the type of the current memory region is reserved where `memblock` structure is:
```C
struct memblock {
@@ -401,7 +401,7 @@ static inline void memblock_set_region_node(struct memblock_region *r, int nid)
}
```
-After this we will have first reserved `memblock` for the extended bios data area in the `.meminit.data` section. `reserve_ebda_region` function finished its work on this step and we can go back to the [arch/x86/kernel/head64.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/head64.c).
+After this we will have first reserved `memblock` for the extended BIOS data area in the `.meminit.data` section. `reserve_ebda_region` function finished its work on this step and we can go back to the [arch/x86/kernel/head64.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/head64.c).
We finished all preparations before the kernel entry point! The last step in the `x86_64_start_reservations` function is the call of the:
@@ -416,7 +416,7 @@ That's all for this part.
Conclusion
--------------------------------------------------------------------------------
-It is the end of the third part about linux kernel insides. In next part we will see the first initialization steps in the kernel entry point - `start_kernel` function. It will be the first step before we will see launch of the first `init` process.
+It is the end of the third part about Linux kernel insides. In next part we will see the first initialization steps in the kernel entry point - `start_kernel` function. It will be the first step before we will see launch of the first `init` process.
If you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).
diff --git a/Initialization/linux-initialization-4.md b/Initialization/linux-initialization-4.md
index fd21b853..f720f6d2 100644
--- a/Initialization/linux-initialization-4.md
+++ b/Initialization/linux-initialization-4.md
@@ -4,7 +4,7 @@ Kernel initialization. Part 4.
Kernel entry point
================================================================================
-If you have read the previous part - [Last preparations before the kernel entry point](https://github.com/0xAX/linux-insides/blob/master/Initialization/linux-initialization-3.md), you can remember that we finished all pre-initialization stuff and stopped right before the call to the `start_kernel` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c). The `start_kernel` is the entry of the generic and architecture independent kernel code, although we will return to the `arch/` folder many times. If you look inside of the `start_kernel` function, you will see that this function is very big. For this moment it contains about `86` calls of functions. Yes, it's very big and of course this part will not cover all the processes that occur in this function. In the current part we will only start to do it. This part and all the next which will be in the [Kernel initialization process](https://github.com/0xAX/linux-insides/blob/master/Initialization/README.md) chapter will cover it.
+If you have read the previous part - [Last preparations before the kernel entry point](https://github.com/0xAX/linux-insides/blob/master/Initialization/linux-initialization-3.md), you can remember that we finished all pre-initialization stuff and stopped right before the call to the `start_kernel` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c). The `start_kernel` is the entry of the generic and architecture independent kernel code, although we will return to the `arch/` folder many times. If you look inside of the `start_kernel` function, you will see that this function is very big. For this moment it contains about `86` function calls. Yes, it's very big and of course this part will not cover all the processes that occur in this function. In the current part we will only start to do it. This part and all the next which will be in the [Kernel initialization process](https://github.com/0xAX/linux-insides/blob/master/Initialization/README.md) chapter will cover it.
The main purpose of the `start_kernel` to finish kernel initialization process and launch the first `init` process. Before the first process will be started, the `start_kernel` must do many things such as: to enable [lock validator](https://www.kernel.org/doc/Documentation/locking/lockdep-design.txt), to initialize processor id, to enable early [cgroups](http://en.wikipedia.org/wiki/Cgroups) subsystem, to setup per-cpu areas, to initialize different caches in [vfs](http://en.wikipedia.org/wiki/Virtual_file_system), to initialize memory manager, rcu, vmalloc, scheduler, IRQs, ACPI and many many more. Only after these steps will we see the launch of the first `init` process in the last part of this chapter. So much kernel code awaits us, let's start.
@@ -53,7 +53,7 @@ struct task_struct init_task = INIT_TASK(init_task);
where `task_struct` stores all the information about a process. I will not explain this structure in this book because it's very big. You can find its definition in [include/linux/sched.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/sched.h#L1278). At this moment `task_struct` contains more than `100` fields! Although you will not see the explanation of the `task_struct` in this book, we will use it very often since it is the fundamental structure which describes the `process` in the Linux kernel. I will describe the meaning of the fields of this structure as we meet them in practice.
-You can see the definition of the `init_task` and it initialized by the `INIT_TASK` macro. This macro is from [include/linux/init_task.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/init_task.h) and it just fills the `init_task` with the values for the first process. For example it sets:
+You can see the definition of the `init_task` and it is initialized by the `INIT_TASK` macro. This macro is from [include/linux/init_task.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/init_task.h) and it just fills the `init_task` with the values for the first process. For example it sets:
* init process state to zero or `runnable`. A runnable process is one which is waiting only for a CPU to run on;
* init process flags - `PF_KTHREAD` which means - kernel thread;
@@ -63,7 +63,7 @@ You can see the definition of the `init_task` and it initialized by the `INIT_TA
```C
union thread_union {
- struct thread_info thread_info;
+ struct thread_info thread_info;
unsigned long stack[THREAD_SIZE/sizeof(long)];
};
```
@@ -74,7 +74,7 @@ Every process has its own stack and it is 16 kilobytes or 4 page frames in `x86_
struct thread_info {
struct task_struct *task;
struct exec_domain *exec_domain;
- __u32 flags;
+ __u32 flags;
__u32 status;
__u32 cpu;
int saved_preempt_count;
@@ -179,7 +179,7 @@ As we got the end of the `init` process stack, we write `STACK_END_MAGIC` there.
if (*end_of_stack(task) != STACK_END_MAGIC) {
//
// handle stack overflow here
- //
+ //
}
```
@@ -263,16 +263,16 @@ Remember that we have passed `cpu_number` as `pcp` to the `this_cpu_read` from t
__bad_size_call_parameter(); break; \
} \
pscr_ret__; \
-})
+})
```
-Yes, it looks a little strange but it's easy. First of all we can see the definition of the `pscr_ret__` variable with the `int` type. Why int? Ok, `variable` is `common_cpu` and it was declared as per-cpu int variable:
+Yes, it looks a little strange but it's easy. First of all we can see the definition of the `pscr_ret__` variable with the `int` type. Why int? Ok, `variable` is `cpu_number` and it was declared as per-cpu int variable:
```C
DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
```
-In the next step we call `__verify_pcpu_ptr` with the address of `cpu_number`. `__veryf_pcpu_ptr` used to verify that the given parameter is a per-cpu pointer. After that we set `pscr_ret__` value which depends on the size of the variable. Our `common_cpu` variable is `int`, so it 4 bytes in size. It means that we will get `this_cpu_read_4(common_cpu)` in `pscr_ret__`. In the end of the `__pcpu_size_call_return` we just call it. `this_cpu_read_4` is a macro:
+In the next step we call `__verify_pcpu_ptr` with the address of `cpu_number`. `__verify_pcpu_ptr` is used to verify that the given parameter is a per-cpu pointer. After that we set the `pscr_ret__` value which depends on the size of the variable. Our `cpu_number` variable is `int`, so it's 4 bytes in size. It means that we will get `this_cpu_read_4(cpu_number)` in `pscr_ret__`. At the end of the `__pcpu_size_call_return` we just call it. `this_cpu_read_4` is a macro:
```C
#define this_cpu_read_4(pcp) percpu_from_op("mov", pcp)
@@ -281,19 +281,19 @@ In the next step we call `__verify_pcpu_ptr` with the address of `cpu_number`. `
which calls `percpu_from_op` and pass `mov` instruction and per-cpu variable there. `percpu_from_op` will expand to the inline assembly call:
```C
-asm("movl %%gs:%1,%0" : "=r" (pfo_ret__) : "m" (common_cpu))
+asm("movl %%gs:%1,%0" : "=r" (pfo_ret__) : "m" (cpu_number))
```
-Let's try to understand how it works and what it does. The `gs` segment register contains the base of per-cpu area. Here we just copy `common_cpu` which is in memory to the `pfo_ret__` with the `movl` instruction. Or with another words:
+Let's try to understand how it works and what it does. The `gs` segment register contains the base of the per-cpu area. Here we just copy `cpu_number` which is in memory to the `pfo_ret__` with the `movl` instruction. Or in other words:
```C
-this_cpu_read(common_cpu)
+this_cpu_read(cpu_number)
```
is the same as:
```C
-movl %gs:$common_cpu, $pfo_ret__
+movl %gs:$cpu_number, $pfo_ret__
```
As we didn't setup per-cpu area, we have only one - for the current running CPU, we will get `zero` as a result of the `smp_processor_id`.
@@ -350,7 +350,7 @@ If you're not sure that this `set_cpu_*` operations and `cpumask` are not clear
As we activated the bootstrap processor, it's time to go to the next function in the `start_kernel.` Now it is `page_address_init`, but this function does nothing in our case, because it executes only when all `RAM` can't be mapped directly.
-Print linux banner
+Print Linux banner
---------------------------------------------------------------------------------
The next call is `pr_notice`:
diff --git a/Initialization/linux-initialization-5.md b/Initialization/linux-initialization-5.md
index 95c26f0e..ad0fd09a 100644
--- a/Initialization/linux-initialization-5.md
+++ b/Initialization/linux-initialization-5.md
@@ -31,7 +31,7 @@ We already saw implementation of the `set_intr_gate` in the previous part about
* base address of the interrupt/exception handler;
* third parameter is - `Interrupt Stack Table`. `IST` is a new mechanism in the `x86_64` and part of the [TSS](http://en.wikipedia.org/wiki/Task_state_segment). Every active thread in kernel mode has own kernel stack which is `16` kilobytes. While a thread in user space, this kernel stack is empty.
-In addition to per-thread stacks, there are a couple of specialized stacks associated with each CPU. All about these stack you can read in the linux kernel documentation - [Kernel stacks](https://www.kernel.org/doc/Documentation/x86/kernel-stacks). `x86_64` provides feature which allows to switch to a new `special` stack for during any events as non-maskable interrupt and etc... And the name of this feature is - `Interrupt Stack Table`. There can be up to 7 `IST` entries per CPU and every entry points to the dedicated stack. In our case this is `DEBUG_STACK`.
+In addition to per-thread stacks, there are a couple of specialized stacks associated with each CPU. You can read all about these stacks in the Linux kernel documentation - [Kernel stacks](https://www.kernel.org/doc/Documentation/x86/kernel-stacks). `x86_64` provides a feature which allows switching to a new `special` stack during events such as a non-maskable interrupt, etc. The name of this feature is `Interrupt Stack Table`. There can be up to 7 `IST` entries per CPU and every entry points to a dedicated stack. In our case this is `DEBUG_STACK`.
`set_intr_gate_ist` and `set_system_intr_gate_ist` work by the same principle as `set_intr_gate` with only one difference. Both of these functions checks
interrupt number and call `_set_gate` inside:
@@ -43,12 +43,12 @@ _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
as `set_intr_gate` does this. But `set_intr_gate` calls `_set_gate` with [dpl](http://en.wikipedia.org/wiki/Privilege_level) - 0, and ist - 0, but `set_intr_gate_ist` and `set_system_intr_gate_ist` sets `ist` as `DEBUG_STACK` and `set_system_intr_gate_ist` sets `dpl` as `0x3` which is the lowest privilege. When an interrupt occurs and the hardware loads such a descriptor, then hardware automatically sets the new stack pointer based on the IST value, then invokes the interrupt handler. All of the special kernel stacks will be set in the `cpu_init` function (we will see it later).
-As `#DB` and `#BP` gates written to the `idt_descr`, we reload `IDT` table with `load_idt` which just call `ldtr` instruction. Now let's look on interrupt handlers and will try to understand how they works. Of course, I can't cover all interrupt handlers in this book and I do not see the point in this. It is very interesting to delve in the linux kernel source code, so we will see how `debug` handler implemented in this part, and understand how other interrupt handlers are implemented will be your task.
+As `#DB` and `#BP` gates are written to the `idt_descr`, we reload the `IDT` table with `load_idt` which just calls the `lidt` instruction. Now let's look at interrupt handlers and try to understand how they work. Of course, I can't cover all interrupt handlers in this book and I do not see the point in this. It is very interesting to delve into the Linux kernel source code, so we will see how the `debug` handler is implemented in this part, and understanding how other interrupt handlers are implemented will be your task.
#DB handler
--------------------------------------------------------------------------------
-As you can read above, we passed address of the `#DB` handler as `&debug` in the `set_intr_gate_ist`. [lxr.free-electrons.com](http://lxr.free-electrons.com/ident) is a great resource for searching identifiers in the linux kernel source code, but unfortunately you will not find `debug` handler with it. All of you can find, it is `debug` definition in the [arch/x86/include/asm/traps.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/traps.h):
+As you can read above, we passed the address of the `#DB` handler as `&debug` in the `set_intr_gate_ist`. [lxr.free-electrons.com](http://lxr.free-electrons.com/ident) is a great resource for searching identifiers in the Linux kernel source code, but unfortunately you will not find the `debug` handler with it. All you can find is the `debug` declaration in the [arch/x86/include/asm/traps.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/traps.h):
```C
asmlinkage void debug(void);
@@ -163,9 +163,9 @@ The next step is initialization of early `ioremap`. In general there are two way
* I/O Ports;
* Device memory.
-We already saw first method (`outb/inb` instructions) in the part about linux kernel booting [process](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-3). The second method is to map I/O physical addresses to virtual addresses. When a physical address is accessed by the CPU, it may refer to a portion of physical RAM which can be mapped on memory of the I/O device. So `ioremap` used to map device memory into kernel address space.
+We already saw first method (`outb/inb` instructions) in the part about Linux kernel booting [process](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-3). The second method is to map I/O physical addresses to virtual addresses. When a physical address is accessed by the CPU, it may refer to a portion of physical RAM which can be mapped on memory of the I/O device. So `ioremap` used to map device memory into kernel address space.
-As i wrote above next function is the `early_ioremap_init` which re-maps I/O memory to kernel address space so it can access it. We need to initialize early ioremap for early initialization code which needs to temporarily map I/O or memory regions before the normal mapping functions like `ioremap` are available. Implementation of this function is in the [arch/x86/mm/ioremap.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/mm/ioremap.c). At the start of the `early_ioremap_init` we can see definition of the `pmd` pointer with `pmd_t` type (which presents page middle directory entry `typedef struct { pmdval_t pmd; } pmd_t;` where `pmdval_t` is `unsigned long`) and make a check that `fixmap` aligned in a correct way:
+As I wrote above next function is the `early_ioremap_init` which re-maps I/O memory to kernel address space so it can access it. We need to initialize early ioremap for early initialization code which needs to temporarily map I/O or memory regions before the normal mapping functions like `ioremap` are available. Implementation of this function is in the [arch/x86/mm/ioremap.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/mm/ioremap.c). At the start of the `early_ioremap_init` we can see definition of the `pmd` pointer with `pmd_t` type (which presents page middle directory entry `typedef struct { pmdval_t pmd; } pmd_t;` where `pmdval_t` is `unsigned long`) and make a check that `fixmap` aligned in a correct way:
```C
pmd_t *pmd;
@@ -198,7 +198,7 @@ After early `ioremap` was initialized, you can see the following code:
ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
```
-This code obtains major and minor numbers for the root device where `initrd` will be mounted later in the `do_mount_root` function. Major number of the device identifies a driver associated with the device. Minor number referred on the device controlled by driver. Note that `old_decode_dev` takes one parameter from the `boot_params_structure`. As we can read from the x86 linux kernel boot protocol:
+This code obtains major and minor numbers for the root device where `initrd` will be mounted later in the `do_mount_root` function. The major number of the device identifies a driver associated with the device. The minor number refers to the device controlled by the driver. Note that `old_decode_dev` takes one parameter from the `boot_params` structure. As we can read from the x86 Linux kernel boot protocol:
```
Field name: root_dev
@@ -298,7 +298,7 @@ presents abstraction for a tree-like subset of system resources. This structure
|
+-------------+
| |
-| child |
+| child |
| |
+-------------+
```
@@ -408,7 +408,7 @@ static inline void __init copy_edd(void)
Memory descriptor initialization
--------------------------------------------------------------------------------
-The next step is initialization of the memory descriptor of the init process. As you already can know every process has its own address space. This address space presented with special data structure which called `memory descriptor`. Directly in the linux kernel source code memory descriptor presented with `mm_struct` structure. `mm_struct` contains many different fields related with the process address space as start/end address of the kernel code/data, start/end of the brk, number of memory areas, list of memory areas and etc... This structure defined in the [include/linux/mm_types.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/mm_types.h). As every process has its own memory descriptor, `task_struct` structure contains it in the `mm` and `active_mm` field. And our first `init` process has it too. You can remember that we saw the part of initialization of the init `task_struct` with `INIT_TASK` macro in the previous [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-4):
+The next step is initialization of the memory descriptor of the init process. As you may already know, every process has its own address space. This address space is represented by a special data structure called the `memory descriptor`. In the Linux kernel source code the memory descriptor is represented by the `mm_struct` structure. `mm_struct` contains many different fields related to the process address space, such as start/end address of the kernel code/data, start/end of the brk, number of memory areas, list of memory areas, etc. This structure is defined in the [include/linux/mm_types.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/mm_types.h). As every process has its own memory descriptor, the `task_struct` structure contains it in the `mm` and `active_mm` fields. And our first `init` process has it too. You may remember that we saw part of the initialization of the init `task_struct` with the `INIT_TASK` macro in the previous [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-4):
```C
#define INIT_TASK(tsk) \
@@ -422,7 +422,7 @@ The next step is initialization of the memory descriptor of the init process. As
}
```
-`mm` points to the process address space and `active_mm` points to the active address space if process has no address space such as kernel threads (more about it you can read in the [documentation](https://www.kernel.org/doc/Documentation/vm/active_mm.txt)). Now we fill memory descriptor of the initial process:
+`mm` points to the process address space and `active_mm` points to the active address space if process has no address space such as kernel threads (more about it you can read in the [documentation](https://www.kernel.org/doc/Documentation/vm/active_mm.txt)). Now we fill memory descriptor of the initial process:
```C
init_mm.start_code = (unsigned long) _text;
@@ -457,7 +457,7 @@ where `mm_rb` is a red-black tree of the virtual memory areas, `pgd` is a pointe
bss_resource.end = __pa_symbol(__bss_stop)-1;
```
-We already know a little about `resource` structure (read above). Here we fills code/data/bss resources with their physical addresses. You can see it in the `/proc/iomem`:
+We already know a little about `resource` structure (read above). Here we fill code/data/bss resources with their physical addresses. You can see it in the `/proc/iomem`:
```C
00100000-be825fff : System RAM
@@ -477,7 +477,7 @@ static struct resource code_resource = {
};
```
-The last step which we will cover in this part will be `NX` configuration. `NX-bit` or no execute bit is 63-bit in the page directory entry which controls the ability to execute code from all physical pages mapped by the table entry. This bit can only be used/set when the `no-execute` page-protection mechanism is enabled by the setting `EFER.NXE` to 1. In the `x86_configure_nx` function we check that CPU has support of `NX-bit` and it does not disabled. After the check we fill `__supported_pte_mask` depend on it:
+The last step which we will cover in this part will be `NX` configuration. `NX-bit` or no execute bit is bit 63 in the page directory entry which controls the ability to execute code from all physical pages mapped by the table entry. This bit can only be used/set when the `no-execute` page-protection mechanism is enabled by setting `EFER.NXE` to 1. In the `x86_configure_nx` function we check that the CPU supports the `NX-bit` and that it is not disabled. After the check we fill `__supported_pte_mask` depending on it:
```C
void x86_configure_nx(void)
@@ -492,7 +492,7 @@ void x86_configure_nx(void)
Conclusion
--------------------------------------------------------------------------------
-It is the end of the fifth part about linux kernel initialization process. In this part we continued to dive in the `setup_arch` function which makes initialization of architecture-specific stuff. It was long part, but we have not finished with it. As i already wrote, the `setup_arch` is big function, and I am really not sure that we will cover all of it even in the next part. There were some new interesting concepts in this part like `Fix-mapped` addresses, ioremap and etc... Don't worry if they are unclear for you. There is a special part about these concepts - [Linux kernel memory management Part 2.](https://github.com/0xAX/linux-insides/blob/master/MM/linux-mm-2.md). In the next part we will continue with the initialization of the architecture-specific stuff and will see parsing of the early kernel parameters, early dump of the pci devices, `Desktop Management Interface` scanning and many many more.
+It is the end of the fifth part about the Linux kernel initialization process. In this part we continued to dive into the `setup_arch` function which initializes architecture-specific stuff. It was a long part, but we are not finished with it. As I already wrote, the `setup_arch` is a big function, and I am really not sure that we will cover all of it even in the next part. There were some new interesting concepts in this part like `Fix-mapped` addresses, ioremap, etc. Don't worry if they are unclear to you. There is a special part about these concepts - [Linux kernel memory management Part 2.](https://github.com/0xAX/linux-insides/blob/master/MM/linux-mm-2.md). In the next part we will continue with the initialization of the architecture-specific stuff and will see parsing of the early kernel parameters, early dump of the PCI devices, `Desktop Management Interface` scanning and many many more.
If you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).
@@ -504,7 +504,7 @@ Links
* [mm vs active_mm](https://www.kernel.org/doc/Documentation/vm/active_mm.txt)
* [e820](http://en.wikipedia.org/wiki/E820)
* [Supervisor mode access prevention](https://lwn.net/Articles/517475/)
-* [Kernel stacks](https://www.kernel.org/doc/Documentation/x86/x86_64/kernel-stacks)
+* [Kernel stacks](https://www.kernel.org/doc/Documentation/x86/kernel-stacks)
* [TSS](http://en.wikipedia.org/wiki/Task_state_segment)
* [IDT](http://en.wikipedia.org/wiki/Interrupt_descriptor_table)
* [Memory mapped I/O](http://en.wikipedia.org/wiki/Memory-mapped_I/O)
diff --git a/Initialization/linux-initialization-6.md b/Initialization/linux-initialization-6.md
index 43fd5fa1..d15279db 100644
--- a/Initialization/linux-initialization-6.md
+++ b/Initialization/linux-initialization-6.md
@@ -4,7 +4,7 @@ Kernel initialization. Part 6.
Architecture-specific initialization, again...
================================================================================
-In the previous [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-5) we saw architecture-specific (`x86_64` in our case) initialization stuff from the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/setup.c) and finished on `x86_configure_nx` function which sets the `_PAGE_NX` flag depends on support of [NX bit](http://en.wikipedia.org/wiki/NX_bit). As I wrote before `setup_arch` function and `start_kernel` are very big, so in this and in the next part we will continue to learn about architecture-specific initialization process. The next function after `x86_configure_nx` is `parse_early_param`. This function is defined in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) and as you can understand from its name, this function parses kernel command line and setups different services depends on the given parameters (all kernel command line parameters you can find are in the [Documentation/kernel-parameters.txt](https://github.com/torvalds/linux/blob/master/Documentation/admin-guide/kernel-parameters.rst)). You may remember how we setup `earlyprintk` in the earliest [part](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-2). On the early stage we looked for kernel parameters and their value with the `cmdline_find_option` function and `__cmdline_find_option`, `__cmdline_find_option_bool` helpers from the [arch/x86/boot/cmdline.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/cmdline.c). There we're in the generic kernel part which does not depend on architecture and here we use another approach. If you are reading linux kernel source code, you already note calls like this:
+In the previous [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-5) we saw architecture-specific (`x86_64` in our case) initialization stuff from the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/setup.c) and finished on the `x86_configure_nx` function which sets the `_PAGE_NX` flag depending on support of the [NX bit](http://en.wikipedia.org/wiki/NX_bit). As I wrote before, the `setup_arch` function and `start_kernel` are very big, so in this and in the next part we will continue to learn about the architecture-specific initialization process. The next function after `x86_configure_nx` is `parse_early_param`. This function is defined in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) and as you can understand from its name, this function parses the kernel command line and sets up different services depending on the given parameters (all kernel command line parameters can be found in the [Documentation/kernel-parameters.txt](https://github.com/torvalds/linux/blob/master/Documentation/admin-guide/kernel-parameters.rst)). You may remember how we set up `earlyprintk` in the earliest [part](https://0xax.gitbook.io/linux-insides/summary/booting/linux-bootstrap-2). On the early stage we looked for kernel parameters and their value with the `cmdline_find_option` function and `__cmdline_find_option`, `__cmdline_find_option_bool` helpers from the [arch/x86/boot/cmdline.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/boot/cmdline.c). There we're in the generic kernel part which does not depend on architecture and here we use another approach. If you are reading Linux kernel source code, you have probably already noticed calls like this:
```C
early_param("gbpages", parse_direct_gbpages_on);
@@ -60,7 +60,7 @@ Note that `__set_param` macro defines with `__section(.init.setup)` attribute. I
VMLINUX_SYMBOL(__setup_end) = .;
```
-Now we know how parameters are defined, let's back to the `parse_early_param` implementation:
+Now we know how parameters are defined, let's back to the `parse_early_param` implementation:
```C
void __init parse_early_param(void)
@@ -128,7 +128,7 @@ int __init acpi_mps_check(void)
}
```
-It checks the built-in `MPS` or [MultiProcessor Specification](http://en.wikipedia.org/wiki/MultiProcessor_Specification) table. If `CONFIG_X86_LOCAL_APIC` is set and `CONFIG_x86_MPPAARSE` is not set, `acpi_mps_check` prints warning message if the one of the command line options: `acpi=off`, `acpi=noirq` or `pci=noacpi` passed to the kernel. If `acpi_mps_check` returns `1` it means that we disable local [APIC](http://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller) and clear `X86_FEATURE_APIC` bit in the of the current CPU with the `setup_clear_cpu_cap` macro. (more about CPU mask you can read in the [CPU masks](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2)).
+It checks the built-in `MPS` or [MultiProcessor Specification](http://en.wikipedia.org/wiki/MultiProcessor_Specification) table. If `CONFIG_X86_LOCAL_APIC` is set and `CONFIG_X86_MPPARSE` is not set, `acpi_mps_check` prints a warning message if one of the command line options: `acpi=off`, `acpi=noirq` or `pci=noacpi` is passed to the kernel. If `acpi_mps_check` returns `1` it means that we disable local [APIC](http://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller) and clear the `X86_FEATURE_APIC` bit for the current CPU with the `setup_clear_cpu_cap` macro. (more about CPU mask you can read in the [CPU masks](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-2)).
Early PCI dump
--------------------------------------------------------------------------------
@@ -165,7 +165,7 @@ char *__init pcibios_setup(char *str) {
}
```
-So, if `CONFIG_PCI` option is set and we passed `pci=earlydump` option to the kernel command line, next function which will be called - `early_dump_pci_devices` from the [arch/x86/pci/early.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/pci/early.c). This function checks `noearly` pci parameter with:
+So, if `CONFIG_PCI` option is set and we passed `pci=earlydump` option to the kernel command line, next function which will be called - `early_dump_pci_devices` from the [arch/x86/pci/early.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/pci/early.c). This function checks `noearly` PCI parameter with:
```C
if (!early_pci_allowed())
@@ -208,7 +208,7 @@ After the `early_dump_pci_devices`, there are a couple of function related with
early_reserve_e820_mpc_new();
```
-Let's look on it. As you can see the first function is `e820_reserve_setup_data`. This function does almost the same as `memblock_x86_reserve_range_setup_data` which we saw above, but it also calls `e820_update_range` which adds new regions to the `e820map` with the given type which is `E820_RESERVED_KERN` in our case. The next function is `finish_e820_parsing` which sanitizes `e820map` with the `sanitize_e820_map` function. Besides this two functions we can see a couple of functions related to the [e820](http://en.wikipedia.org/wiki/E820). You can see it in the listing above. `e820_add_kernel_range` function takes the physical address of the kernel start and end:
+Let's look at it. As you can see the first function is `e820_reserve_setup_data`. This function does almost the same as `memblock_x86_reserve_range_setup_data` which we saw above, but it also calls `e820_update_range` which adds new regions to the `e820map` with the given type which is `E820_RESERVED_KERN` in our case. The next function is `finish_e820_parsing` which sanitizes `e820map` with the `sanitize_e820_map` function. Besides these two functions we can see a couple of functions related to the [e820](http://en.wikipedia.org/wiki/E820). You can see them in the listing above. The `e820_add_kernel_range` function takes the physical address of the kernel start and end:
```C
u64 start = __pa_symbol(_text);
@@ -273,13 +273,13 @@ if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
max_low_pfn = e820_end_of_low_ram_pfn();
else
max_low_pfn = max_pfn;
-
+
high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
```
Next we calculate `high_memory` (defines the upper bound on direct map memory) with `__va` macro which returns a virtual address by the given physical memory.
-DMI scanning
+DMI scanning
-------------------------------------------------------------------------------
The next step after manipulations with different memory regions and `e820` slots is collecting information about computer. We will get all information with the [Desktop Management Interface](http://en.wikipedia.org/wiki/Desktop_Management_Interface) and following functions:
@@ -356,7 +356,7 @@ RESERVE_BRK(dmi_alloc, 65536);
#endif
```
-`RESERVE_BRK` defined in the [arch/x86/include/asm/setup.h](http://en.wikipedia.org/wiki/Desktop_Management_Interface) and reserves space with given size in the `brk` section.
+`RESERVE_BRK` is defined in the [arch/x86/include/asm/setup.h](http://github.com/torvalds/linux/blob/master/arch/x86/include/asm/setup.h) and reserves space with given size in the `brk` section.
-------------------------
init_hypervisor_platform();
@@ -451,7 +451,7 @@ void __init early_alloc_pgt_buf(void)
}
```
-First of all it get the size of the page table buffer, it will be `INIT_PGT_BUF_SIZE` which is `(6 * PAGE_SIZE)` in the current linux kernel 4.0. As we got the size of the page table buffer, we call `extend_brk` function with two parameters: size and align. As you can understand from its name, this function extends the `brk` area. As we can see in the linux kernel linker script `brk` is in memory right after the [BSS](http://en.wikipedia.org/wiki/.bss):
+First of all it gets the size of the page table buffer, it will be `INIT_PGT_BUF_SIZE` which is `(6 * PAGE_SIZE)` in the current Linux kernel 4.0. As we got the size of the page table buffer, we call `extend_brk` function with two parameters: size and align. As you can understand from its name, this function extends the `brk` area. As we can see in the Linux kernel linker script `brk` is in memory right after the [BSS](http://en.wikipedia.org/wiki/.bss):
```C
. = ALIGN(PAGE_SIZE);
@@ -517,12 +517,12 @@ MEMBLOCK configuration:
The rest functions after the `memblock_x86_fill` are: `early_reserve_e820_mpc_new` allocates additional slots in the `e820map` for MultiProcessor Specification table, `reserve_real_mode` - reserves low memory from `0x0` to 1 megabyte for the trampoline to the real mode (for rebooting, etc.), `trim_platform_memory_ranges` - trims certain memory regions started from `0x20050000`, `0x20110000`, etc. these regions must be excluded because [Sandy Bridge](http://en.wikipedia.org/wiki/Sandy_Bridge) has problems with these regions, `trim_low_memory_range` reserves the first 4 kilobyte page in `memblock`, `init_mem_mapping` function reconstructs direct memory mapping and setups the direct mapping of the physical memory at `PAGE_OFFSET`, `early_trap_pf_init` setups `#PF` handler (we will look on it in the chapter about interrupts) and `setup_real_mode` function setups trampoline to the [real mode](http://en.wikipedia.org/wiki/Real_mode) code.
-That's all. You can note that this part will not cover all functions which are in the `setup_arch` (like `early_gart_iommu_check`, [mtrr](http://en.wikipedia.org/wiki/Memory_type_range_register) initialization, etc.). As I already wrote many times, `setup_arch` is big, and linux kernel is big. That's why I can't cover every line in the linux kernel. I don't think that we missed something important, but you can say something like: each line of code is important. Yes, it's true, but I missed them anyway, because I think that it is not realistic to cover full linux kernel. Anyway we will often return to the idea that we have already seen, and if something is unfamiliar, we will cover this theme.
+That's all. You can note that this part will not cover all functions which are in the `setup_arch` (like `early_gart_iommu_check`, [mtrr](http://en.wikipedia.org/wiki/Memory_type_range_register) initialization, etc.). As I already wrote many times, `setup_arch` is big, and the Linux kernel is big. That's why I can't cover every line in the Linux kernel. I don't think that we missed something important, but you can say something like: each line of code is important. Yes, it's true, but I missed them anyway, because I think that it is not realistic to cover the full Linux kernel. Anyway we will often return to the idea that we have already seen, and if something is unfamiliar, we will cover this theme.
Conclusion
--------------------------------------------------------------------------------
-It is the end of the sixth part about linux kernel initialization process. In this part we continued to dive in the `setup_arch` function again and it was long part, but we are not finished with it. Yes, `setup_arch` is big, hope that next part will be the last part about this function.
+It is the end of the sixth part about Linux kernel initialization process. In this part we continued to dive in the `setup_arch` function again and it was long part, but we are not finished with it. Yes, `setup_arch` is big, hope that next part will be the last part about this function.
If you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).
diff --git a/Initialization/linux-initialization-7.md b/Initialization/linux-initialization-7.md
index 7e02df20..8198a7c9 100644
--- a/Initialization/linux-initialization-7.md
+++ b/Initialization/linux-initialization-7.md
@@ -19,7 +19,7 @@ Now let's look on the implementation of the `setup_log_buf` function. It starts
```C
if (log_buf != __log_buf)
return;
-
+
if (!early && !new_log_buf_len)
log_buf_add_cpu();
```
@@ -41,7 +41,7 @@ if (ramdisk_size >= (mapped_size>>1))
"disabling initrd (%lld needed, %lld available)\n",
ramdisk_size, mapped_size>>1);
```
-
+
You can see here that we call `memblock_mem_size` function and pass the `max_pfn_mapped` to it, where `max_pfn_mapped` contains the highest direct mapped page frame number. If you do not remember what is `page frame number`, explanation is simple: First `12` bits of the virtual address represent offset in the physical page or page frame. If we right-shift out `12` bits of the virtual address, we'll discard offset part and will get `Page Frame Number`. In the `memblock_mem_size` we go through the all memblock `mem` (not reserved) regions and calculates size of the mapped pages and return it to the `mapped_size` variable (see code above). As we got amount of the direct mapped memory, we check that size of the `initrd` is not greater than mapped pages. If it is greater we just call `panic` which halts the system and prints famous [Kernel panic](http://en.wikipedia.org/wiki/Kernel_panic) message. In the next step we print information about the `initrd` size. We can see the result of this in the `dmesg` output:
```C
@@ -183,13 +183,13 @@ function. The `cma_declare_contiguous` reserves contiguous area from the given b
Initialization of the sparse memory
--------------------------------------------------------------------------------
-The next step is the call of the function - `x86_init.paging.pagetable_init`. If you try to find this function in the linux kernel source code, in the end of your search, you will see the following macro:
+The next step is the call of the function - `x86_init.paging.pagetable_init`. If you try to find this function in the Linux kernel source code, in the end of your search, you will see the following macro:
```C
#define native_pagetable_init paging_init
```
-which expands as you can see to the call of the `paging_init` function from the [arch/x86/mm/init_64.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/mm/init_64.c). The `paging_init` function initializes sparse memory and zone sizes. First of all what's zones and what is it `Sparsemem`. The `Sparsemem` is a special foundation in the linux kernel memory manager which used to split memory area into different memory banks in the [NUMA](http://en.wikipedia.org/wiki/Non-uniform_memory_access) systems. Let's look on the implementation of the `paginig_init` function:
+which expands as you can see to the call of the `paging_init` function from the [arch/x86/mm/init_64.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/mm/init_64.c). The `paging_init` function initializes sparse memory and zone sizes. First of all, what are zones and what is `Sparsemem`? The `Sparsemem` is a special foundation in the Linux kernel memory manager which is used to split memory area into different memory banks in the [NUMA](http://en.wikipedia.org/wiki/Non-uniform_memory_access) systems. Let's look at the implementation of the `paging_init` function:
```C
void __init paging_init(void)
@@ -209,7 +209,7 @@ As you can see there is call of the `sparse_memory_present_with_active_regions`
Again, this part and next parts do not cover this theme in full details. There will be special part about `NUMA`.
-vsyscall mapping
+vsyscall mapping
--------------------------------------------------------------------------------
The next step after `SparseMem` initialization is setting of the `trampoline_cr4_features` which must contain content of the `cr4` [Control register](http://en.wikipedia.org/wiki/Control_register). First of all we need to check that current CPU has support of the `cr4` register and if it has, we save its content to the `trampoline_cr4_features` which is storage for `cr4` in the real mode:
@@ -315,7 +315,7 @@ struct mpf_intel *mpf = mpf_found;
if (!mpf)
return;
-
+
if (acpi_lapic && early)
return;
```
@@ -325,7 +325,7 @@ Here we can see that multiprocessor configuration was found in the `smp_scan_con
The rest of the setup_arch
--------------------------------------------------------------------------------
-Here we are getting to the end of the `setup_arch` function. The rest of function of course is important, but details about these stuff will not will not be included in this part. We will just take a short look on these functions, because although they are important as I wrote above, but they cover non-generic kernel features related with the `NUMA`, `SMP`, `ACPI` and `APICs`, etc. First of all, the next call of the `init_apic_mappings` function. As we can understand this function sets the address of the local [APIC](http://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller). The next is `x86_io_apic_ops.init` and this function initializes I/O APIC. Please note that we will see all details related with `APIC` in the chapter about interrupts and exceptions handling. In the next step we reserve standard I/O resources like `DMA`, `TIMER`, `FPU`, etc., with the call of the `x86_init.resources.reserve_resources` function. Following is `mcheck_init` function initializes `Machine check Exception` and the last is `register_refined_jiffies` which registers [jiffy](http://en.wikipedia.org/wiki/Jiffy_%28time%29) (There will be separate chapter about timers in the kernel).
+Here we are getting to the end of the `setup_arch` function. The rest of function of course is important, but details about this stuff will not be included in this part. We will just take a short look on these functions, because although they are important as I wrote above, they cover non-generic kernel features related with the `NUMA`, `SMP`, `ACPI` and `APICs`, etc. First of all, the next call of the `init_apic_mappings` function. As we can understand this function sets the address of the local [APIC](http://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller). The next is `x86_io_apic_ops.init` and this function initializes I/O APIC. Please note that we will see all details related with `APIC` in the chapter about interrupts and exceptions handling. In the next step we reserve standard I/O resources like `DMA`, `TIMER`, `FPU`, etc., with the call of the `x86_init.resources.reserve_resources` function. Following is the `mcheck_init` function which initializes `Machine check Exception` and the last is `register_refined_jiffies` which registers [jiffy](http://en.wikipedia.org/wiki/Jiffy_%28time%29) (There will be separate chapter about timers in the kernel).
So that's all. Finally we have finished with the big `setup_arch` function in this part. Of course as I already wrote many times, we did not see full details about this function, but do not worry about it. We will be back more than once to this function from different chapters for understanding how different platform-dependent parts are initialized.
@@ -360,7 +360,7 @@ This function takes pointer to the kernel command line allocates a couple of buf
* `initcall_command_line` - will contain boot command line. will be used in the `do_initcall_level`;
* `static_command_line` - will contain command line for parameters parsing.
-We will allocate space with the `memblock_virt_alloc` function. This function calls `memblock_virt_alloc_try_nid` which allocates boot memory block with `memblock_reserve` if [slab](http://en.wikipedia.org/wiki/Slab_allocation) is not available or uses `kzalloc_node` (more about it will be in the linux memory management chapter). The `memblock_virt_alloc` uses `BOOTMEM_LOW_LIMIT` (physical address of the `(PAGE_OFFSET + 0x1000000)` value) and `BOOTMEM_ALLOC_ACCESSIBLE` (equal to the current value of the `memblock.current_limit`) as minimum address of the memory region and maximum address of the memory region.
+We will allocate space with the `memblock_virt_alloc` function. This function calls `memblock_virt_alloc_try_nid` which allocates boot memory block with `memblock_reserve` if [slab](http://en.wikipedia.org/wiki/Slab_allocation) is not available or uses `kzalloc_node` (more about it will be in the Linux memory management chapter). The `memblock_virt_alloc` uses `BOOTMEM_LOW_LIMIT` (physical address of the `(PAGE_OFFSET + 0x1000000)` value) and `BOOTMEM_ALLOC_ACCESSIBLE` (equal to the current value of the `memblock.current_limit`) as minimum address of the memory region and maximum address of the memory region.
Let's look on the implementation of the `setup_command_line`:
@@ -458,7 +458,7 @@ That's all.
Conclusion
================================================================================
-It is the end of the seventh part about the linux kernel initialization process. In this part, finally we have finished with the `setup_arch` function and returned to the `start_kernel` function. In the next part we will continue to learn generic kernel code from the `start_kernel` and will continue our way to the first `init` process.
+It is the end of the seventh part about the Linux kernel initialization process. In this part, finally we have finished with the `setup_arch` function and returned to the `start_kernel` function. In the next part we will continue to learn generic kernel code from the `start_kernel` and will continue our way to the first `init` process.
If you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).
diff --git a/Initialization/linux-initialization-8.md b/Initialization/linux-initialization-8.md
index 9b74cf6e..1964f5df 100644
--- a/Initialization/linux-initialization-8.md
+++ b/Initialization/linux-initialization-8.md
@@ -75,7 +75,7 @@ static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
The `get_cpu_gdt_table` uses `per_cpu` macro for getting value of a `gdt_page` percpu variable for the given CPU number (bootstrap processor with `id` - 0 in our case).
-You may ask the following question: so, if we can access `gdt_page` percpu variable, where it was defined? Actually we already saw it in this book. If you have read the first [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-1) of this chapter, you can remember that we saw definition of the `gdt_page` in the [arch/x86/kernel/head_64.S](https://github.com/0xAX/linux/blob/0a07b238e5f488b459b6113a62e06b6aab017f71/arch/x86/kernel/head_64.S):
+You may ask the following question: so, if we can access `gdt_page` percpu variable, where was it defined? Actually we already saw it in this book. If you have read the first [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-1) of this chapter, you can remember that we saw definition of the `gdt_page` in the [arch/x86/kernel/head_64.S](https://github.com/0xAX/linux/blob/0a07b238e5f488b459b6113a62e06b6aab017f71/arch/x86/kernel/head_64.S):
```assembly
early_gdt_descr:
@@ -117,29 +117,29 @@ void load_percpu_segment(int cpu) {
}
```
-The base address of the `percpu` area must contain `gs` register (or `fs` register for `x86`), so we are using `loadsegment` macro and pass `gs`. In the next step we writes the base address if the [IRQ](http://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) stack and setup stack [canary](http://en.wikipedia.org/wiki/Buffer_overflow_protection) (this is only for `x86_32`). After we load new `GDT`, we fill `cpu_callout_mask` bitmap with the current cpu and set cpu state as online with the setting `cpu_state` percpu variable for the current processor - `CPU_ONLINE`:
+The base address of the `percpu` area must contain `gs` register (or `fs` register for `x86`), so we are using `loadsegment` macro and pass `gs`. In the next step we write the base address of the [IRQ](http://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) stack and setup stack [canary](http://en.wikipedia.org/wiki/Buffer_overflow_protection) (this is only for `x86_32`). After we load new `GDT`, we fill `cpu_callout_mask` bitmap with the current cpu and set cpu state as online with the setting `cpu_state` percpu variable for the current processor - `CPU_ONLINE`:
```C
cpumask_set_cpu(me, cpu_callout_mask);
per_cpu(cpu_state, me) = CPU_ONLINE;
```
-So, what is `cpu_callout_mask` bitmap... As we initialized bootstrap processor (processor which is booted the first on `x86`) the other processors in a multiprocessor system are known as `secondary processors`. Linux kernel uses following two bitmasks:
+So, what is `cpu_callout_mask` bitmap? As we initialized bootstrap processor (processor which is booted the first on `x86`) the other processors in a multiprocessor system are known as `secondary processors`. Linux kernel uses following two bitmasks:
* `cpu_callout_mask`
* `cpu_callin_mask`
-After bootstrap processor initialized, it updates the `cpu_callout_mask` to indicate which secondary processor can be initialized next. All other or secondary processors can do some initialization stuff before and check the `cpu_callout_mask` on the bootstrap processor bit. Only after the bootstrap processor filled the `cpu_callout_mask` with this secondary processor, it will continue the rest of its initialization. After that the certain processor finish its initialization process, the processor sets bit in the `cpu_callin_mask`. Once the bootstrap processor finds the bit in the `cpu_callin_mask` for the current secondary processor, this processor repeats the same procedure for initialization of one of the remaining secondary processors. In a short words it works as i described, but we will see more details in the chapter about `SMP`.
-
+After bootstrap processor initialized, it updates the `cpu_callout_mask` to indicate which secondary processor can be initialized next. All other or secondary processors can do some initialization stuff before and check the `cpu_callout_mask` on the bootstrap processor bit. Only after the bootstrap processor filled the `cpu_callout_mask` with this secondary processor, it will continue the rest of its initialization. After a certain processor finishes its initialization process, the processor sets bit in the `cpu_callin_mask`. Once the bootstrap processor finds the bit in the `cpu_callin_mask` for the current secondary processor, this processor repeats the same procedure for initialization of one of the remaining secondary processors. In short, it works as I described, but we will see more details in the chapter about `SMP`.
+
That's all. We did all `SMP` boot preparation.
Build zonelists
-----------------------------------------------------------------------
-In the next step we can see the call of the `build_all_zonelists` function. This function sets up the order of zones that allocations are preferred from. What are zones and what's order we will understand soon. For the start let's see how linux kernel considers physical memory. Physical memory is split into banks which are called - `nodes`. If you has no hardware support for `NUMA`, you will see only one node:
+In the next step we can see the call of the `build_all_zonelists` function. This function sets up the order of zones that allocations are preferred from. What are zones and what's order we will understand soon. For the start let's see how Linux kernel considers physical memory. Physical memory is split into banks which are called - `nodes`. If you have no hardware support for `NUMA`, you will see only one node:
```
-$ cat /sys/devices/system/node/node0/numastat
+$ cat /sys/devices/system/node/node0/numastat
numa_hit 72452442
numa_miss 0
numa_foreign 0
@@ -148,7 +148,7 @@ local_node 72452442
other_node 0
```
-Every `node` is presented by the `struct pglist_data` in the linux kernel. Each node is divided into a number of special blocks which are called - `zones`. Every zone is presented by the `zone struct` in the linux kernel and has one of the type:
+Every `node` is presented by the `struct pglist_data` in the Linux kernel. Each node is divided into a number of special blocks which are called - `zones`. Every zone is presented by the `zone struct` in the Linux kernel and has one of the following types:
* `ZONE_DMA` - 0-16M;
* `ZONE_DMA32` - used for 32 bit devices that can only do DMA areas below 4G;
@@ -185,13 +185,13 @@ As I wrote above all nodes are described with the `pglist_data` or `pg_data_t` s
The rest of the stuff before scheduler initialization
--------------------------------------------------------------------------------
-Before we will start to dive into linux kernel scheduler initialization process we must do a couple of things. The first thing is the `page_alloc_init` function from the [mm/page_alloc.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/mm/page_alloc.c). This function looks pretty easy:
+Before we start to dive into Linux kernel scheduler initialization process we must do a couple of things. The first thing is the `page_alloc_init` function from the [mm/page_alloc.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/mm/page_alloc.c). This function looks pretty easy:
```C
void __init page_alloc_init(void)
{
int ret;
-
+
ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
"mm/page_alloc:dead", NULL,
page_alloc_cpu_dead);
@@ -205,11 +205,11 @@ After this function we can see the kernel command line in the initialization out

-And a couple of functions such as `parse_early_param` and `parse_args` which handles linux kernel command line. You may remember that we already saw the call of the `parse_early_param` function in the sixth [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-6) of the kernel initialization chapter, so why we call it again? Answer is simple: we call this function in the architecture-specific code (`x86_64` in our case), but not all architecture calls this function. And we need to call the second function `parse_args` to parse and handle non-early command line arguments.
+And a couple of functions such as `parse_early_param` and `parse_args` which handle the Linux kernel command line. You may remember that we already saw the call of the `parse_early_param` function in the sixth [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-6) of the kernel initialization chapter, so why do we call it again? The answer is simple: we call this function in the architecture-specific code (`x86_64` in our case), but not all architectures call this function. And we need to call the second function `parse_args` to parse and handle non-early command line arguments.
In the next step we can see the call of the `jump_label_init` from the [kernel/jump_label.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/jump_label.c). and initializes [jump label](https://lwn.net/Articles/412072/).
-After this we can see the call of the `setup_log_buf` function which setups the [printk](http://www.makelinux.net/books/lkd2/ch18lev1sec3) log buffer. We already saw this function in the seventh [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-7) of the linux kernel initialization process chapter.
+After this we can see the call of the `setup_log_buf` function which setups the [printk](http://www.makelinux.net/books/lkd2/ch18lev1sec3) log buffer. We already saw this function in the seventh [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-7) of the Linux kernel initialization process chapter.
PID hash initialization
--------------------------------------------------------------------------------
@@ -230,7 +230,7 @@ pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
```
The number of elements of the `pid_hash` depends on the `RAM` configuration, but it can be between `2^4` and `2^12`. The `pidhash_init` computes the size
-and allocates the required storage (which is `hlist` in our case - the same as [doubly linked list](https://0xax.gitbook.io/linux-insides/summary/datastructures/linux-datastructures-1), but contains one pointer instead on the [struct hlist_head](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/types.h)]. The `alloc_large_system_hash` function allocates a large system hash table with `memblock_virt_alloc_nopanic` if we pass `HASH_EARLY` flag (as it in our case) or with `__vmalloc` if we did no pass this flag.
+and allocates the required storage (which is `hlist` in our case - the same as [doubly linked list](https://0xax.gitbook.io/linux-insides/summary/datastructures/linux-datastructures-1), but contains one pointer instead on the [struct hlist_head](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/types.h)). The `alloc_large_system_hash` function allocates a large system hash table with `memblock_virt_alloc_nopanic` if we pass `HASH_EARLY` flag (as it is in our case) or with `__vmalloc` if we did not pass this flag.
The result we can see in the `dmesg` output:
@@ -244,7 +244,7 @@ $ dmesg | grep hash
That's all. The rest of the stuff before scheduler initialization is the following functions: `vfs_caches_init_early` does early initialization of the [virtual file system](http://en.wikipedia.org/wiki/Virtual_file_system) (more about it will be in the chapter which will describe virtual file system), `sort_main_extable` sorts the kernel's built-in exception table entries which are between `__start___ex_table` and `__stop___ex_table`, and `trap_init` initializes trap handlers (more about last two function we will know in the separate chapter about interrupts).
-The last step before the scheduler initialization is initialization of the memory manager with the `mm_init` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c). As we can see, the `mm_init` function initializes different parts of the linux kernel memory manager:
+The last step before the scheduler initialization is initialization of the memory manager with the `mm_init` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c). As we can see, the `mm_init` function initializes different parts of the Linux kernel memory manager:
```C
page_ext_init_flatmem();
@@ -255,7 +255,7 @@ pgtable_init();
vmalloc_init();
```
-The first is `page_ext_init_flatmem` which depends on the `CONFIG_SPARSEMEM` kernel configuration option and initializes extended data per page handling. The `mem_init` releases all `bootmem`, the `kmem_cache_init` initializes kernel cache, the `percpu_init_late` - replaces `percpu` chunks with those allocated by [slub](http://en.wikipedia.org/wiki/SLUB_%28software%29), the `pgtable_init` - initializes the `page->ptl` kernel cache, the `vmalloc_init` - initializes `vmalloc`. Please, **NOTE** that we will not dive into details about all of these functions and concepts, but we will see all of they it in the [Linux kernel memory manager](https://0xax.gitbook.io/linux-insides/summary/mm) chapter.
+The first is `page_ext_init_flatmem` which depends on the `CONFIG_SPARSEMEM` kernel configuration option and initializes extended data per page handling. The `mem_init` releases all `bootmem`, the `kmem_cache_init` initializes kernel cache, the `percpu_init_late` - replaces `percpu` chunks with those allocated by [slub](http://en.wikipedia.org/wiki/SLUB_%28software%29), the `pgtable_init` - initializes the `page->ptl` kernel cache, the `vmalloc_init` - initializes `vmalloc`. Please, **NOTE** that we will not dive into details about all of these functions and concepts, but we will see all of them in the [Linux kernel memory manager](https://0xax.gitbook.io/linux-insides/summary/mm) chapter.
That's all. Now we can look on the `scheduler`.
@@ -318,7 +318,7 @@ The `Completely Fair Scheduler` supports following `normal` or in other words `n
The `SCHED_NORMAL` is used for the most normal applications, the amount of cpu each process consumes is mostly determined by the [nice](http://en.wikipedia.org/wiki/Nice_%28Unix%29) value, the `SCHED_BATCH` used for the 100% non-interactive tasks and the `SCHED_IDLE` runs tasks only when the processor has no task to run besides this task.
-The `real-time` policies are also supported for the time-critical applications: `SCHED_FIFO` and `SCHED_RR`. If you've read something about the Linux kernel scheduler, you can know that it is modular. That means it supports different algorithms to schedule different types of processes. Usually this modularity is called `scheduler classes`. These modules encapsulate scheduling policy details and are handled by the scheduler core without knowing too much about them.
+The `real-time` policies are also supported for the time-critical applications: `SCHED_FIFO` and `SCHED_RR`. If you've read something about the Linux kernel scheduler, you can know that it is modular. That means it supports different algorithms to schedule different types of processes. Usually this modularity is called `scheduler classes`. These modules encapsulate scheduling policy details and are handled by the scheduler core without knowing too much about them.
Now let's get back to the our code and look on the two configuration options: `CONFIG_FAIR_GROUP_SCHED` and `CONFIG_RT_GROUP_SCHED`. The smallest unit that the scheduler works with is an individual task or thread. However, a process is not the only type of entity that the scheduler can operate with. Both of these options provide support for group scheduling. The first option provides support for group scheduling with the `completely fair scheduler` policies and the second with the `real-time` policies respectively.
@@ -340,11 +340,11 @@ The first is for case when scheduling of task groups is enabled with `completely
* scheduler entity structure;
* `runqueue`.
-After we have calculated size, we allocate a space with the `kzalloc` function and set pointers of `sched_entity` and `runquques` there:
+After we have calculated size, we allocate a space with the `kzalloc` function and set pointers of `sched_entity` and `runqueues` there:
```C
ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
-
+
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.se = (struct sched_entity **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
@@ -396,10 +396,10 @@ All groups have to be able to rely on the amount of CPU time. The two following
The first represents a period and the second represents quantum that is allocated for `real-time` tasks during `sched_rt_period_us`. You may see global values of these parameters in the:
```
-$ cat /proc/sys/kernel/sched_rt_period_us
+$ cat /proc/sys/kernel/sched_rt_period_us
1000000
-$ cat /proc/sys/kernel/sched_rt_runtime_us
+$ cat /proc/sys/kernel/sched_rt_runtime_us
950000
```
@@ -415,7 +415,7 @@ That's all with the bandwiths of `real-time` and `deadline` tasks and in the nex
The real-time scheduler requires global resources to make scheduling decision. But unfortunately scalability bottlenecks appear as the number of CPUs increase. The concept of `root domains` was introduced for improving scalability and avoid such bottlenecks. Instead of bypassing over all `run queues`, the scheduler gets information about a CPU where/from to push/pull a `real-time` task from the `root_domain` structure. This structure is defined in the [kernel/sched/sched.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/sched/sched.h) kernel header file and just keeps track of CPUs that can be used to push or pull a process.
-After `root domain` initialization, we make initialization of the `bandwidth` for the `real-time` tasks of the `root task group` as we did the same above:
+After `root domain` initialization, we make initialization of the `bandwidth` for the `real-time` tasks of the `root task group` as we did the same above:
```C
#ifdef CONFIG_RT_GROUP_SCHED
init_rt_bandwidth(&root_task_group.rt_bandwidth,
@@ -499,7 +499,7 @@ struct task_struct {
}
```
-The first one is `dynamic priority` which can't be changed during lifetime of a process based on its static priority and interactivity of the process. The `static_prio` contains initial priority most likely well-known to you `nice value`. This value does not changed by the kernel if a user will not change it. The last one is `normal_priority` based on the value of the `static_prio` too, but also it depends on the scheduling policy of a process.
+The first one is `dynamic priority` which can be changed during the lifetime of a process based on its static priority and interactivity of the process. The `static_prio` contains initial priority most likely well-known to you `nice value`. This value is not changed by the kernel if a user does not change it. The last one is `normal_priority` based on the value of the `static_prio` too, but also it depends on the scheduling policy of a process.
So the main goal of the `set_load_weight` function is to initialize `load_weight` fields for the `init` task:
@@ -541,12 +541,12 @@ The last two steps of the `sched_init` function is to initialization of schedule
scheduler_running = 1;
```
-That's all. Linux kernel scheduler is initialized. Of course, we have skipped many different details and explanations here, because we need to know and understand how different concepts (like process and process groups, runqueue, rcu, etc.) works in the linux kernel , but we took a short look on the scheduler initialization process. We will look all other details in the separate part which will be fully dedicated to the scheduler.
+That's all. Linux kernel scheduler is initialized. Of course, we have skipped many different details and explanations here, because we need to know and understand how different concepts (like process and process groups, runqueue, rcu, etc.) work in the Linux kernel, but we took a short look at the scheduler initialization process. We will look at all other details in a separate part which will be fully dedicated to the scheduler.
Conclusion
--------------------------------------------------------------------------------
-It is the end of the eighth part about the linux kernel initialization process. In this part, we looked on the initialization process of the scheduler and we will continue in the next part to dive in the linux kernel initialization process and will see initialization of the [RCU](http://en.wikipedia.org/wiki/Read-copy-update) and many other initialization stuff in the next part.
+It is the end of the eighth part about the Linux kernel initialization process. In this part, we looked at the initialization process of the scheduler and we will continue in the next part to dive into the Linux kernel initialization process and will see initialization of the [RCU](http://en.wikipedia.org/wiki/Read-copy-update) and many other initialization stuff.
If you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).
diff --git a/Initialization/linux-initialization-9.md b/Initialization/linux-initialization-9.md
index 0684a57a..3ee00077 100644
--- a/Initialization/linux-initialization-9.md
+++ b/Initialization/linux-initialization-9.md
@@ -4,7 +4,7 @@ Kernel initialization. Part 9.
RCU initialization
================================================================================
-This is ninth part of the [Linux Kernel initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization) and in the previous part we stopped at the [scheduler initialization](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-8). In this part we will continue to dive to the linux kernel initialization process and the main purpose of this part will be to learn about initialization of the [RCU](http://en.wikipedia.org/wiki/Read-copy-update). We can see that the next step in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) after the `sched_init` is the call of the `preempt_disable`. There are two macros:
+This is the ninth part of the [Linux Kernel initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization) and in the previous part we stopped at the [scheduler initialization](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-8). In this part we will continue to dive into the Linux kernel initialization process and the main purpose of this part will be to learn about initialization of the [RCU](http://en.wikipedia.org/wiki/Read-copy-update). We can see that the next step in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) after the `sched_init` is the call of the `preempt_disable`. There are two macros:
* `preempt_disable`
* `preempt_enable`
@@ -71,7 +71,7 @@ That's all. Preemption is disabled and we can go ahead.
Initialization of the integer ID management
--------------------------------------------------------------------------------
-In the next step we can see the call of the `idr_init_cache` function which defined in the [lib/idr.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/lib/idr.c). The `idr` library is used in a various [places](http://lxr.free-electrons.com/ident?i=idr_find) in the linux kernel to manage assigning integer `IDs` to objects and looking up objects by id.
+In the next step we can see the call of the `idr_init_cache` function which is defined in the [lib/idr.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/lib/idr.c). The `idr` library is used in various [places](http://lxr.free-electrons.com/ident?i=idr_find) in the Linux kernel to manage assigning integer `IDs` to objects and looking up objects by id.
Let's look on the implementation of the `idr_init_cache` function:
@@ -120,14 +120,14 @@ More about integer ID management you can read [here](https://lwn.net/Articles/10
RCU initialization
--------------------------------------------------------------------------------
-The next step is [RCU](http://en.wikipedia.org/wiki/Read-copy-update) initialization with the `rcu_init` function and it's implementation depends on two kernel configuration options:
+The next step is [RCU](http://en.wikipedia.org/wiki/Read-copy-update) initialization with the `rcu_init` function and its implementation depends on two kernel configuration options:
* `CONFIG_TINY_RCU`
* `CONFIG_TREE_RCU`
In the first case `rcu_init` will be in the [kernel/rcu/tiny.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/rcu/tiny.c) and in the second case it will be defined in the [kernel/rcu/tree.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/rcu/tree.c). We will see the implementation of the `tree rcu`, but first of all about the `RCU` in general.
-`RCU` or read-copy update is a scalable high-performance synchronization mechanism implemented in the Linux kernel. On the early stage the linux kernel provided support and environment for the concurrently running applications, but all execution was serialized in the kernel using a single global lock. In our days linux kernel has no single global lock, but provides different mechanisms including [lock-free data structures](http://en.wikipedia.org/wiki/Concurrent_data_structure), [percpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) data structures and other. One of these mechanisms is - the `read-copy update`. The `RCU` technique is designed for rarely-modified data structures. The idea of the `RCU` is simple. For example we have a rarely-modified data structure. If somebody wants to change this data structure, we make a copy of this data structure and make all changes in the copy. In the same time all other users of the data structure use old version of it. Next, we need to choose safe moment when original version of the data structure will have no users and update it with the modified copy.
+`RCU` or read-copy update is a scalable high-performance synchronization mechanism implemented in the Linux kernel. In the early stages the Linux kernel provided support and environment for the concurrently running applications, but all execution was serialized in the kernel using a single global lock. Nowadays the Linux kernel has no single global lock, but provides different mechanisms including [lock-free data structures](http://en.wikipedia.org/wiki/Concurrent_data_structure), [percpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) data structures and others. One of these mechanisms is the `read-copy update`. The `RCU` technique is designed for rarely-modified data structures. The idea of the `RCU` is simple. For example we have a rarely-modified data structure. If somebody wants to change this data structure, we make a copy of this data structure and make all changes in the copy. At the same time all other users of the data structure use the old version of it. Next, we need to choose a safe moment when the original version of the data structure will have no users and update it with the modified copy.
Of course this description of the `RCU` is very simplified. To understand some details about `RCU`, first of all we need to learn some terminology. Data readers in the `RCU` executed in the [critical section](http://en.wikipedia.org/wiki/Critical_section). Every time when data reader get to the critical section, it calls the `rcu_read_lock`, and `rcu_read_unlock` on exit from the critical section. If the thread is not in the critical section, it will be in state which called - `quiescent state`. The moment when every thread is in the `quiescent state` called - `grace period`. If a thread wants to remove an element from the data structure, this occurs in two steps. First step is `removal` - atomically removes element from the data structure, but does not release the physical memory. After this thread-writer announces and waits until it is finished. From this moment, the removed element is available to the thread-readers. After the `grace period` finished, the second step of the element removal will be started, it just removes the element from the physical memory.
@@ -292,7 +292,7 @@ extern struct rcu_state rcu_bh_state;
DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
```
-About this states you can read [here](http://lwn.net/Articles/264090/). As I wrote above we need to initialize `rcu_state` structures and `rcu_init_one` function will help us with it. After the `rcu_state` initialization, we can see the call of the ` __rcu_init_preempt` which depends on the `CONFIG_PREEMPT_RCU` kernel configuration option. It does the same as previous functions - initialization of the `rcu_preempt_state` structure with the `rcu_init_one` function which has `rcu_state` type. After this, in the `rcu_init`, we can see the call of the:
+About these states you can read [here](http://lwn.net/Articles/264090/). As I wrote above we need to initialize `rcu_state` structures and `rcu_init_one` function will help us with it. After the `rcu_state` initialization, we can see the call of the `__rcu_init_preempt` which depends on the `CONFIG_PREEMPT_RCU` kernel configuration option. It does the same as previous functions - initialization of the `rcu_preempt_state` structure with the `rcu_init_one` function which has `rcu_state` type. After this, in the `rcu_init`, we can see the call of the:
```C
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
@@ -370,13 +370,13 @@ That's all. We saw initialization process of the `RCU` subsystem. As I wrote abo
Rest of the initialization process
--------------------------------------------------------------------------------
-Ok, we already passed the main theme of this part which is `RCU` initialization, but it is not the end of the linux kernel initialization process. In the last paragraph of this theme we will see a couple of functions which work in the initialization time, but we will not dive into deep details around this function for different reasons. Some reasons not to dive into details are following:
+Ok, we already passed the main theme of this part which is `RCU` initialization, but it is not the end of the Linux kernel initialization process. In the last paragraph of this theme we will see a couple of functions which work in the initialization time, but we will not dive into deep details around these functions for different reasons. Some reasons not to dive into details are the following:
* They are not very important for the generic kernel initialization process and depend on the different kernel configuration;
* They have the character of debugging and not important for now;
* We will see many of this stuff in the separate parts/chapters.
-After we initialized `RCU`, the next step which you can see in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) is the - `trace_init` function. As you can understand from its name, this function initialize [tracing](http://en.wikipedia.org/wiki/Tracing_%28software%29) subsystem. You can read more about linux kernel trace system - [here](http://elinux.org/Kernel_Trace_Systems).
+After we initialized `RCU`, the next step which you can see in the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) is the `trace_init` function. As you can understand from its name, this function initializes the [tracing](http://en.wikipedia.org/wiki/Tracing_%28software%29) subsystem. You can read more about the Linux kernel trace system [here](http://elinux.org/Kernel_Trace_Systems).
After the `trace_init`, we can see the call of the `radix_tree_init`. If you are familiar with the different data structures, you can understand from the name of this function that it initializes kernel implementation of the [Radix tree](http://en.wikipedia.org/wiki/Radix_tree). This function is defined in the [lib/radix-tree.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/lib/radix-tree.c) and you can read more about it in the part about [Radix tree](https://0xax.gitbook.io/linux-insides/summary/datastructures/linux-datastructures-2).
@@ -405,7 +405,7 @@ This is the end of the ninth part of the [linux kernel initialization process](h
Conclusion
--------------------------------------------------------------------------------
-It is the end of the ninth part about the linux kernel [initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization). In this part, we looked on the initialization process of the `RCU` subsystem. In the next part we will continue to dive into linux kernel initialization process and I hope that we will finish with the `start_kernel` function and will go to the `rest_init` function from the same [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) source code file and will see the start of the first process.
+It is the end of the ninth part about the Linux kernel [initialization process](https://0xax.gitbook.io/linux-insides/summary/initialization). In this part, we looked at the initialization process of the `RCU` subsystem. In the next part we will continue to dive into the Linux kernel initialization process and I hope that we will finish with the `start_kernel` function and will go to the `rest_init` function from the same [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c) source code file and will see the start of the first process.
If you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).
diff --git a/Interrupts/README.md b/Interrupts/README.md
index 9710c984..e23b566e 100644
--- a/Interrupts/README.md
+++ b/Interrupts/README.md
@@ -1,6 +1,6 @@
# Interrupts and Interrupt Handling
-In the following posts, we will cover interrupts and exceptions handling in the linux kernel.
+In the following posts, we will cover interrupts and exceptions handling in the Linux kernel.
* [Interrupts and Interrupt Handling. Part 1.](linux-interrupts-1.md) - describes interrupts and interrupt handling theory.
* [Interrupts in the Linux Kernel](linux-interrupts-2.md) - describes stuffs related to interrupts and exceptions handling from the early stage.
diff --git a/Interrupts/images/kernel.png b/Interrupts/images/kernel.png
index 93a05238..f5475cc4 100644
Binary files a/Interrupts/images/kernel.png and b/Interrupts/images/kernel.png differ
diff --git a/Interrupts/linux-interrupts-1.md b/Interrupts/linux-interrupts-1.md
index 0c1c8c22..cb37b612 100644
--- a/Interrupts/linux-interrupts-1.md
+++ b/Interrupts/linux-interrupts-1.md
@@ -281,8 +281,7 @@ The `PAGE_SIZE` is `4096`-bytes and the `THREAD_SIZE_ORDER` depends on the `KASA
#define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER)
```
-Or `16384` bytes. The per-cpu interrupt stack is represented by the `irq_stack` struct and the `fixed_percpu_data` struct
-in the Linux kernel for `x86_64`:
+Or `16384` bytes. The per-cpu interrupt stack is represented by the `irq_stack` struct and the `fixed_percpu_data` struct in the Linux kernel for `x86_64`:
```C
/* Per CPU interrupt stacks */
@@ -306,7 +305,7 @@ struct fixed_percpu_data {
#endif
```
-The `irq_stack` struct contains a 16 kilobytes array.
+The `irq_stack` struct contains a 16-kilobyte array.
Also, you can see that the fixed\_percpu\_data contains two fields:
* `gs_base` - The `gs` register always points to the bottom of the `fixed_percpu_data`. On the `x86_64`, the `gs` register is shared by per-cpu area and stack canary (more about `per-cpu` variables you can read in the special [part](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1)). All per-cpu symbols are zero-based and the `gs` points to the base of the per-cpu area. You already know that [segmented memory model](http://en.wikipedia.org/wiki/Memory_segmentation) is abolished in the long mode, but we can set the base address for the two segment registers - `fs` and `gs` with the [Model specific registers](http://en.wikipedia.org/wiki/Model-specific_register) and these registers can be still be used as address registers. If you remember the first [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-1) of the Linux kernel initialization process, you can remember that we have set the `gs` register:
@@ -373,11 +372,11 @@ int irq_init_percpu_irqstack(unsigned int cpu)
Here we go over all the CPUs one-by-one and setup the `hardirq_stack_ptr`.
Where `map_irq_stack` is called to initialize the `hardirq_stack_ptr`,
-to point onto the `irq_backing_store` of the current CPU with an offset of IRQ\_STACK\_SIZE,
+to point onto the `irq_stack_backing_store` of the current CPU with an offset of IRQ\_STACK\_SIZE,
either with guard pages or without when KASan is enabled.
-After the initialization of the interrupt stack, we need to initialize the gs register within [arch/x86/kernel/cpu/common.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/cpu/common.c):
+After the initialization of the interrupt stack, we need to initialize the gs register within [arch/x86/kernel/cpu/common.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/cpu/common.c):
```C
void load_percpu_segment(int cpu)
@@ -406,7 +405,7 @@ and as we already know the `gs` register points to the bottom of the interrupt s
Here we can see the `wrmsr` instruction, which loads the data from `edx:eax` into the [Model specific register](http://en.wikipedia.org/wiki/Model-specific_register) pointed by the `ecx` register. In our case the model specific register is `MSR_GS_BASE`, which contains the base address of the memory segment pointed to by the `gs` register. `edx:eax` points to the address of the `initial_gs,` which is the base address of our `fixed_percpu_data`.
-We already know that `x86_64` has a feature called `Interrupt Stack Table` or `IST` and this feature provides the ability to switch to a new stack for events like a non-maskable interrupt, double fault etc. There can be up to seven `IST` entries per-cpu. Some of them are:
+We already know that `x86_64` has a feature called `Interrupt Stack Table` or `IST` and this feature provides the ability to switch to a new stack for events like a non-maskable interrupt, double fault, etc. There can be up to seven `IST` entries per-cpu. Some of them are:
* `DOUBLEFAULT_STACK`
* `NMI_STACK`
@@ -433,7 +432,7 @@ static const __initconst struct idt_data def_idts[] = {
INTG(X86_TRAP_DF, double_fault),
```
-where `nmi` and `double_fault` are entry points created at [arch/x86/kernel/entry_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/entry/entry_64.S):
+where `nmi` and `double_fault` are entry points created at [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/entry/entry_64.S):
```assembly
idtentry double_fault do_double_fault has_error_code=1 paranoid=2 read_cr2=1
diff --git a/Interrupts/linux-interrupts-10.md b/Interrupts/linux-interrupts-10.md
index 1641f56c..4c55d295 100644
--- a/Interrupts/linux-interrupts-10.md
+++ b/Interrupts/linux-interrupts-10.md
@@ -6,7 +6,7 @@ Last part
This is the tenth part of the [chapter](https://0xax.gitbook.io/linux-insides/summary/interrupts) about interrupts and interrupt handling in the Linux kernel and in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-9) we saw a little about deferred interrupts and related concepts like `softirq`, `tasklet` and `workqeue`. In this part we will continue to dive into this theme and now it's time to look at real hardware driver.
-Let's consider serial driver of the [StrongARM** SA-110/21285 Evaluation Board](http://netwinder.osuosl.org/pub/netwinder/docs/intel/datashts/27813501.pdf) board for example and will look how this driver requests an [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) line,
+Let's consider the serial driver of the [StrongARM** SA-110/21285 Evaluation Board](http://netwinder.osuosl.org/pub/netwinder/docs/intel/datashts/27813501.pdf) for example and look at how this driver requests an [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) line,
what happens when an interrupt is triggered and etc. The source code of this driver is placed in the [drivers/tty/serial/21285.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/drivers/tty/serial/21285.c) source code file. Ok, we have source code, let's start.
Initialization of a kernel module
@@ -111,7 +111,7 @@ if (ret == 0)
return ret;
```
-That's all. Our driver is initialized. When an `uart` port will be opened with the call of the `uart_open` function from the [drivers/tty/serial/serial_core.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/drivers/tty/serial/serial_core.c), it will call the `uart_startup` function to start up the serial port. This function will call the `startup` function that is part of the `uart_ops` structure. Each `uart` driver has the definition of this structure, in our case it is:
+That's all. Our driver is initialized. When an `uart` port is opened with the call of the `uart_open` function from the [drivers/tty/serial/serial_core.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/drivers/tty/serial/serial_core.c), it will call the `uart_startup` function to start up the serial port. This function will call the `startup` function that is part of the `uart_ops` structure. Each `uart` driver has the definition of this structure, in our case it is:
```C
static struct uart_ops serial21285_ops = {
@@ -243,7 +243,7 @@ if (!irq_settings_can_request(desc) || WARN_ON(irq_settings_is_per_cpu_devid(des
return -EINVAL;
```
-and exit with the `-EINVAL`otherways. After this we check the given interrupt handler. If it was not passed to the `request_irq` function, we check the `thread_fn`. If both handlers are `NULL`, we return with the `-EINVAL`. If an interrupt handler was not passed to the `request_irq` function, but the `thread_fn` is not null, we set handler to the `irq_default_primary_handler`:
+and exit with the `-EINVAL` otherwise. After this we check the given interrupt handler. If it was not passed to the `request_irq` function, we check the `thread_fn`. If both handlers are `NULL`, we return with the `-EINVAL`. If an interrupt handler was not passed to the `request_irq` function, but the `thread_fn` is not null, we set handler to the `irq_default_primary_handler`:
```C
if (!handler) {
@@ -296,12 +296,12 @@ if (new->thread_fn && !nested) {
}
```
-And fill the rest of the given interrupt descriptor fields in the end. So, our `16` and `17` interrupt request lines are registered and the `serial21285_rx_chars` and `serial21285_tx_chars` functions will be invoked when an interrupt controller will get event related to these interrupts. Now let's look at what happens when an interrupt occurs.
+And fill the rest of the given interrupt descriptor fields in the end. So, our `16` and `17` interrupt request lines are registered and the `serial21285_rx_chars` and `serial21285_tx_chars` functions will be invoked when an interrupt controller gets an event related to these interrupts. Now let's look at what happens when an interrupt occurs.
Prepare to handle an interrupt
--------------------------------------------------------------------------------
-In the previous paragraph we saw the requesting of the irq line for the given interrupt descriptor and registration of the `irqaction` structure for the given interrupt. We already know that when an interrupt event occurs, an interrupt controller notifies the processor about this event and processor tries to find appropriate interrupt gate for this interrupt. If you have read the eighth [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-8) of this chapter, you may remember the `native_init_IRQ` function. This function makes initialization of the local [APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller). The following part of this function is the most interesting part for us right now:
+In the previous paragraph we saw the requesting of the irq line for the given interrupt descriptor and registration of the `irqaction` structure for the given interrupt. We already know that when an interrupt event occurs, an interrupt controller notifies the processor about this event and processor tries to find appropriate interrupt gate for this interrupt. If you have read the eighth [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-8) of this chapter, you may remember the `native_init_IRQ` function. This function makes initialization of the local [APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller). The following part of this function is the most interesting part for us right now:
```C
for_each_clear_bit_from(i, used_vectors, first_system_vector) {
@@ -398,7 +398,7 @@ static inline void generic_handle_irq_desc(unsigned int irq, struct irq_desc *de
}
```
-But stop... What is it `handle_irq` and why do we call our interrupt handler from the interrupt descriptor when we know that `irqaction` points to the actual interrupt handler? Actually the `irq_desc->handle_irq` is a high-level API for the calling interrupt handler routine. It setups during initialization of the [device tree](https://en.wikipedia.org/wiki/Device_tree) and [APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller) initialization. The kernel selects correct function and call chain of the `irq->action(s)` there. In this way, the `serial21285_tx_chars` or the `serial21285_rx_chars` function will be executed after an interrupt will occur.
+But stop... What is `handle_irq` and why do we call our interrupt handler from the interrupt descriptor when we know that `irqaction` points to the actual interrupt handler? Actually the `irq_desc->handle_irq` is a high-level API for calling the interrupt handler routine. It is set up during initialization of the [device tree](https://en.wikipedia.org/wiki/Device_tree) and [APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller) initialization. The kernel selects the correct function and call chain of the `irq->action(s)` there. In this way, the `serial21285_tx_chars` or the `serial21285_rx_chars` function will be executed after an interrupt occurs.
In the end of the `do_IRQ` function we call the `irq_exit` function that will exit from the interrupt context, the `set_irq_regs` with the old userspace registers and return:
@@ -413,7 +413,7 @@ We already know that when an `IRQ` finishes its work, deferred interrupts will b
Exit from interrupt
--------------------------------------------------------------------------------
-Ok, the interrupt handler finished its execution and now we must return from the interrupt. When the work of the `do_IRQ` function will be finished, we will return back to the assembler code in the [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/entry/entry_64.S) to the `ret_from_intr` label. First of all we disable interrupts with the `DISABLE_INTERRUPTS` macro that expands to the `cli` instruction and decreases value of the `irq_count` [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) variable. Remember, this variable had value - `1`, when we were in interrupt context:
+Ok, the interrupt handler finished its execution and now we must return from the interrupt. When the work of the `do_IRQ` function is finished, we will return back to the assembler code in the [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/entry/entry_64.S) to the `ret_from_intr` label. First of all we disable interrupts with the `DISABLE_INTERRUPTS` macro that expands to the `cli` instruction and decreases value of the `irq_count` [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) variable. Remember, this variable had value - `1`, when we were in interrupt context:
```assembly
DISABLE_INTERRUPTS(CLBR_NONE)
@@ -462,8 +462,8 @@ Links
* [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29)
* [module](https://en.wikipedia.org/wiki/Loadable_kernel_module)
* [initcall](http://kernelnewbies.org/Documents/InitcallMechanism)
-* [uart](https://en.wikipedia.org/wiki/Universal_asynchronous_receiver/transmitter)
-* [ISA](https://en.wikipedia.org/wiki/Industry_Standard_Architecture)
+* [uart](https://en.wikipedia.org/wiki/Universal_asynchronous_receiver/transmitter)
+* [ISA](https://en.wikipedia.org/wiki/Industry_Standard_Architecture)
* [memory management](https://0xax.gitbook.io/linux-insides/summary/mm)
* [i2c](https://en.wikipedia.org/wiki/I%C2%B2C)
* [APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller)
diff --git a/Interrupts/linux-interrupts-2.md b/Interrupts/linux-interrupts-2.md
index fa683c02..ff2b611e 100644
--- a/Interrupts/linux-interrupts-2.md
+++ b/Interrupts/linux-interrupts-2.md
@@ -196,7 +196,7 @@ for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
load_idt((const struct desc_ptr *)&idt_descr);
```
-AS you can see it has only one difference in the name of the array of the interrupts handlers entry points. Now it is `early_idt_handler_array`:
+As you can see it has only one difference in the name of the array of the interrupts handlers entry points. Now it is `early_idt_handler_array`:
```C
extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE];
@@ -209,7 +209,7 @@ where `NUM_EXCEPTION_VECTORS` and `EARLY_IDT_HANDLER_SIZE` are defined as:
#define EARLY_IDT_HANDLER_SIZE 9
```
-So, the `early_idt_handler_array` is an array of the interrupts handlers entry points and contains one entry point on every nine bytes. You can remember that previous `early_idt_handlers` was defined in the [arch/x86/kernel/head_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/head_64.S). The `early_idt_handler_array` is defined in the same source code file too:
+So, the `early_idt_handler_array` is an array of the interrupts handlers entry points and contains one entry point on every nine bytes. You can remember that previous `early_idt_handlers` was defined in the [arch/x86/kernel/head_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/head_64.S). The `early_idt_handler_array` is defined in the same source code file too:
```assembly
ENTRY(early_idt_handler_array)
@@ -357,7 +357,7 @@ $ sudo cat /proc/lockdep
redundant softirq offs: 0
```
-Ok, now we know a little about tracing, but more info will be in the separate part about `lockdep` and `tracing`. You can see that the both `local_disable_irq` macros have the same part - `raw_local_irq_disable`. This macro defined in the [arch/x86/include/asm/irqflags.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/irqflags.h) and expands to the call of the:
+Ok, now we know a little about tracing, but more info will be in the separate part about `lockdep` and `tracing`. You can see that both `local_irq_disable` macros have the same part - `raw_local_irq_disable`. This macro is defined in the [arch/x86/include/asm/irqflags.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/irqflags.h) and expands to the call of the:
```C
static inline void native_irq_disable(void)
@@ -417,7 +417,7 @@ Here we can see calls of three different functions:
* `set_system_intr_gate_ist`
* `set_intr_gate`
-All of these functions defined in the [arch/x86/include/asm/desc.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/desc.h) and do the similar thing but not the same. The first `set_intr_gate_ist` function inserts new an interrupt gate in the `IDT`. Let's look on its implementation:
+All of these functions are defined in the [arch/x86/include/asm/desc.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/desc.h) and do similar, but not identical, things. The first `set_intr_gate_ist` function inserts a new interrupt gate in the `IDT`. Let's look at its implementation:
```C
static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)
@@ -441,7 +441,7 @@ static inline void _set_gate(int gate, unsigned type, void *addr,
}
```
-Here we start from the `pack_gate` function which takes clean `IDT` entry represented by the `gate_desc` structure and fills it with the base address and limit, [Interrupt Stack Table](https://www.kernel.org/doc/Documentation/x86/x86_64/kernel-stacks), [Privilege level](http://en.wikipedia.org/wiki/Privilege_level), type of an interrupt which can be one of the following values:
+Here we start from the `pack_gate` function which takes clean `IDT` entry represented by the `gate_desc` structure and fills it with the base address and limit, [Interrupt Stack Table](https://www.kernel.org/doc/Documentation/x86/kernel-stacks), [Privilege level](http://en.wikipedia.org/wiki/Privilege_level), type of an interrupt which can be one of the following values:
* `GATE_INTERRUPT`
* `GATE_TRAP`
@@ -494,7 +494,7 @@ static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist)
}
```
-Do you see it? Look on the fourth parameter of the `_set_gate`. It is `0x3`. In the `set_intr_gate` it was `0x0`. We know that this parameter represent `DPL` or privilege level. We also know that `0` is the highest privilege level and `3` is the lowest.Now we know how `set_system_intr_gate_ist`, `set_intr_gate_ist`, `set_intr_gate` are work and we can return to the `early_trap_init` function. Let's look on it again:
+Do you see it? Look at the fourth parameter of the `_set_gate`. It is `0x3`. In the `set_intr_gate` it was `0x0`. We know that this parameter represents `DPL` or privilege level. We also know that `0` is the highest privilege level and `3` is the lowest. Now we know how `set_system_intr_gate_ist`, `set_intr_gate_ist`, `set_intr_gate` work and we can return to the `early_trap_init` function. Let's look at it again:
```C
set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
@@ -537,6 +537,6 @@ Links
* [Union type](http://en.wikipedia.org/wiki/Union_type)
* [this_cpu_* operations](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/Documentation/this_cpu_ops.txt)
* [vector number](http://en.wikipedia.org/wiki/Interrupt_vector_table)
-* [Interrupt Stack Table](https://www.kernel.org/doc/Documentation/x86/x86_64/kernel-stacks)
+* [Interrupt Stack Table](https://www.kernel.org/doc/Documentation/x86/kernel-stacks)
* [Privilege level](http://en.wikipedia.org/wiki/Privilege_level)
* [Previous part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-1)
diff --git a/Interrupts/linux-interrupts-3.md b/Interrupts/linux-interrupts-3.md
index 11cb6e19..d358f2a4 100644
--- a/Interrupts/linux-interrupts-3.md
+++ b/Interrupts/linux-interrupts-3.md
@@ -4,9 +4,9 @@ Interrupts and Interrupt Handling. Part 3.
Exception Handling
--------------------------------------------------------------------------------
-This is the third part of the [chapter](https://0xax.gitbook.io/linux-insides/summary/interrupts) about an interrupts and an exceptions handling in the Linux kernel and in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts) we stopped at the `setup_arch` function from the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blame/master/arch/x86/kernel/setup.c) source code file.
+This is the third part of the [chapter](https://0xax.gitbook.io/linux-insides/summary/interrupts) about interrupts and exceptions handling in the Linux kernel and in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts) we stopped at the `setup_arch` function from the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blame/master/arch/x86/kernel/setup.c) source code file.
-We already know that this function executes initialization of architecture-specific stuff. In our case the `setup_arch` function does [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture related initializations. The `setup_arch` is big function, and in the previous part we stopped on the setting of the two exceptions handlers for the two following exceptions:
+We already know that this function executes initialization of architecture-specific stuff. In our case the `setup_arch` function does [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture related initializations. The `setup_arch` is a big function, and in the previous part we stopped on the setting of the two exception handlers for the two following exceptions:
* `#DB` - debug exception, transfers control from the interrupted process to the debug handler;
* `#BP` - breakpoint exception, caused by the `int 3` instruction.
@@ -24,18 +24,18 @@ void __init early_trap_init(void)
}
```
-from the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/tree/master/arch/x86/kernel/traps.c). We already saw implementation of the `set_intr_gate_ist` and `set_system_intr_gate_ist` functions in the previous part and now we will look on the implementation of these two exceptions handlers.
+from the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/tree/master/arch/x86/kernel/traps.c). We already saw implementation of the `set_intr_gate_ist` and `set_system_intr_gate_ist` functions in the previous part and now we will look on the implementation of these two exception handlers.
Debug and Breakpoint exceptions
--------------------------------------------------------------------------------
-Ok, we setup exception handlers in the `early_trap_init` function for the `#DB` and `#BP` exceptions and now time is to consider their implementations. But before we will do this, first of all let's look on details of these exceptions.
+Ok, we set up exception handlers in the `early_trap_init` function for the `#DB` and `#BP` exceptions and now it is time to consider their implementations. But before we do this, first of all let's look at the details of these exceptions.
The first exceptions - `#DB` or `debug` exception occurs when a debug event occurs. For example - attempt to change the contents of a [debug register](http://en.wikipedia.org/wiki/X86_debug_register). Debug registers are special registers that were presented in `x86` processors starting from the [Intel 80386](http://en.wikipedia.org/wiki/Intel_80386) processor and as you can understand from name of this CPU extension, main purpose of these registers is debugging.
These registers allow to set breakpoints on the code and read or write data to trace it. Debug registers may be accessed only in the privileged mode and an attempt to read or write the debug registers when executing at any other privilege level causes a [general protection fault](https://en.wikipedia.org/wiki/General_protection_fault) exception. That's why we have used `set_intr_gate_ist` for the `#DB` exception, but not the `set_system_intr_gate_ist`.
-The verctor number of the `#DB` exceptions is `1` (we pass it as `X86_TRAP_DB`) and as we may read in specification, this exception has no error code:
+The vector number of the `#DB` exception is `1` (we pass it as `X86_TRAP_DB`) and as we may read in the specification, this exception has no error code:
```
+-----------------------------------------------------+
@@ -65,6 +65,7 @@ If we will compile and run this program, we will see following output:
```
$ gcc breakpoint.c -o breakpoint
+$ ./breakpoint
i equal to: 0
Trace/breakpoint trap
```
@@ -77,7 +78,7 @@ $ gdb breakpoint
...
...
(gdb) run
-Starting program: /home/alex/breakpoints
+Starting program: /home/alex/breakpoints
i equal to: 0
Program received signal SIGTRAP, Trace/breakpoint trap.
@@ -112,7 +113,7 @@ As you may note before, the `set_intr_gate_ist` and `set_system_intr_gate_ist` f
* `debug`;
* `int3`.
-You will not find these functions in the C code. all of that could be found in the kernel's `*.c/*.h` files only definition of these functions which are located in the [arch/x86/include/asm/traps.h](https://github.com/torvalds/linux/tree/master/arch/x86/include/asm/traps.h) kernel header file:
+You will not find these functions in the C code. All that can be found in the kernel's `*.c/*.h` files is the declaration of these functions, which is located in the [arch/x86/include/asm/traps.h](https://github.com/torvalds/linux/tree/master/arch/x86/include/asm/traps.h) kernel header file:
```C
asmlinkage void debug(void);
@@ -138,7 +139,7 @@ and
idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
```
-Each exception handler may be consists from two parts. The first part is generic part and it is the same for all exception handlers. An exception handler should to save [general purpose registers](https://en.wikipedia.org/wiki/Processor_register) on the stack, switch to kernel stack if an exception came from userspace and transfer control to the second part of an exception handler. The second part of an exception handler does certain work depends on certain exception. For example page fault exception handler should find virtual page for given address, invalid opcode exception handler should send `SIGILL` [signal](https://en.wikipedia.org/wiki/Unix_signal) and etc.
+Each exception handler consists of two parts. The first part is the generic part and it is the same for all exception handlers. An exception handler should save [general purpose registers](https://en.wikipedia.org/wiki/Processor_register) on the stack, switch to the kernel stack if an exception came from userspace and transfer control to the second part of an exception handler. The second part of an exception handler does certain work depending on the exception. For example, the page fault exception handler should find the virtual page for a given address, the invalid opcode exception handler should send the `SIGILL` [signal](https://en.wikipedia.org/wiki/Unix_signal), etc.
As we just saw, an exception handler starts from definition of the `idtentry` macro from the [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/master/arch/x86/entry/entry_64.S) assembly source code file, so let's look at implementation of this macro. As we may see, the `idtentry` macro takes five arguments:
@@ -193,7 +194,7 @@ If we will look at these definitions, we may know that compiler will generate tw
But it is not only fake error-code. Moreover the `-1` also represents invalid system call number, so that the system call restart logic will not be triggered.
-The last two parameters of the `idtentry` macro `shift_ist` and `paranoid` allow to know do an exception handler runned at stack from `Interrupt Stack Table` or not. You already may know that each kernel thread in the system has own stack. In addition to these stacks, there are some specialized stacks associated with each processor in the system. One of these stacks is - exception stack. The [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture provides special feature which is called - `Interrupt Stack Table`. This feature allows to switch to a new stack for designated events such as an atomic exceptions like `double fault` and etc. So the `shift_ist` parameter allows us to know do we need to switch on `IST` stack for an exception handler or not.
+The last two parameters of the `idtentry` macro, `shift_ist` and `paranoid`, allow us to know whether an exception handler runs on a stack from the `Interrupt Stack Table` or not. You may already know that each kernel thread in the system has its own stack. In addition to these stacks, there are some specialized stacks associated with each processor in the system. One of these stacks is the exception stack. The [x86_64](https://en.wikipedia.org/wiki/X86-64) architecture provides a special feature which is called the `Interrupt Stack Table`. This feature allows switching to a new stack for designated events such as atomic exceptions like `double fault`, etc. So the `shift_ist` parameter allows us to know whether we need to switch to the `IST` stack for an exception handler or not.
The second parameter - `paranoid` defines the method which helps us to know did we come from userspace or not to an exception handler. The easiest way to determine this is to via `CPL` or `Current Privilege Level` in `CS` segment register. If it is equal to `3`, we came from userspace, if zero we came from kernel space:
@@ -213,7 +214,7 @@ But unfortunately this method does not give a 100% guarantee. As described in th
> stack but before we executed SWAPGS, then the only safe way to check
> for GS is the slower method: the RDMSR.
-In other words for example `NMI` could happen inside the critical section of a [swapgs](http://www.felixcloutier.com/x86/SWAPGS.html) instruction. In this way we should check value of the `MSR_GS_BASE` [model specific register](https://en.wikipedia.org/wiki/Model-specific_register) which stores pointer to the start of per-cpu area. So to check did we come from userspace or not, we should to check value of the `MSR_GS_BASE` model specific register and if it is negative we came from kernel space, in other way we came from userspace:
+In other words, for example, an `NMI` could happen inside the critical section of a [swapgs](http://www.felixcloutier.com/x86/SWAPGS.html) instruction. In this case we should check the value of the `MSR_GS_BASE` [model specific register](https://en.wikipedia.org/wiki/Model-specific_register) which stores a pointer to the start of the per-cpu area. So to check whether we came from userspace or not, we should check the value of the `MSR_GS_BASE` model specific register: if it is negative we came from kernel space, otherwise we came from userspace:
```assembly
movl $MSR_GS_BASE,%ecx
@@ -224,7 +225,7 @@ js 1f
In first two lines of code we read value of the `MSR_GS_BASE` model specific register into `edx:eax` pair. We can't set negative value to the `gs` from userspace. But from other side we know that direct mapping of the physical memory starts from the `0xffff880000000000` virtual address. In this way, `MSR_GS_BASE` will contain an address from `0xffff880000000000` to `0xffffc7ffffffffff`. After the `rdmsr` instruction will be executed, the smallest possible value in the `%edx` register will be - `0xffff8800` which is `-30720` in unsigned 4 bytes. That's why kernel space `gs` which points to start of `per-cpu` area will contain negative value.
-After we pushed fake error code on the stack, we should allocate space for general purpose registers with:
+After we push fake error code on the stack, we should allocate space for general purpose registers with:
```assembly
ALLOC_PT_GPREGS_ON_STACK
@@ -370,7 +371,7 @@ asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
}
```
-This function takes the result of the `task_ptr_regs` macro which is defined in the [arch/x86/include/asm/processor.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/processor.h) header file, stores it in the stack pointer and return it. The `task_ptr_regs` macro expands to the address of `thread.sp0` which represents pointer to the normal kernel stack:
+This function takes the result of the `task_pt_regs` macro which is defined in the [arch/x86/include/asm/processor.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/processor.h) header file, stores it in the stack pointer and returns it. The `task_pt_regs` macro expands to the address of `thread.sp0` which represents a pointer to the normal kernel stack:
```C
#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
@@ -403,7 +404,7 @@ as it will be passed as first parameter of secondary exception handler.
.endif
```
-Additionally you may see that we zeroed the `%esi` register above in a case if an exception does not provide error code.
+Additionally you may see that we zeroed the `%esi` register above in case an exception does not provide an error code.
In the end we just call secondary exception handler:
@@ -423,7 +424,7 @@ will be for `debug` exception and:
dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code);
```
-will be for `int 3` exception. In this part we will not see implementations of secondary handlers, because of they are very specific, but will see some of them in one of next parts.
+will be for `int 3` exception. In this part we will not see implementations of secondary handlers, because they are very specific, but will see some of them in one of next parts.
We just considered first case when an exception occurred in userspace. Let's consider last two.
@@ -461,7 +462,7 @@ movq %rsp, %rdi
.endif
```
-The last step before a secondary handler of an exception will be called is cleanup of new `IST` stack fram:
+The last step before a secondary handler of an exception will be called is cleanup of new `IST` stack frame:
```assembly
.if \shift_ist != -1
diff --git a/Interrupts/linux-interrupts-4.md b/Interrupts/linux-interrupts-4.md
index 22a37a82..0aacd77c 100644
--- a/Interrupts/linux-interrupts-4.md
+++ b/Interrupts/linux-interrupts-4.md
@@ -99,7 +99,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
}
```
-This register contains a linear address which caused `page fault`. In the next step we make a call of the `exception_enter` function from the [include/linux/context_tracking.h](https://github.com/torvalds/linux/blob/master/include/linux/context_tracking.h). The `exception_enter` and `exception_exit` are functions from context tracking subsystem in the Linux kernel used by the [RCU](https://en.wikipedia.org/wiki/Read-copy-update) to remove its dependency on the timer tick while a processor runs in userspace. Almost in the every exception handler we will see similar code:
+This register contains a linear address which caused `page fault`. In the next step we make a call of the `exception_enter` function from the [include/linux/context_tracking.h](https://github.com/torvalds/linux/blob/master/include/linux/context_tracking.h). The `exception_enter` and `exception_exit` are functions from context tracking subsystem in the Linux kernel used by the [RCU](https://en.wikipedia.org/wiki/Read-copy-update) to remove its dependency on the timer tick while a processor runs in userspace. Almost in every exception handler we will see similar code:
```C
enum ctx_state prev_state;
@@ -182,7 +182,7 @@ or `0x00007ffffffff000`. Pay attention on `unlikely` macro. There are two macros
#define unlikely(x) __builtin_expect(!!(x), 0)
```
-You can [often](http://lxr.free-electrons.com/ident?i=unlikely) find these macros in the code of the Linux kernel. Main purpose of these macros is optimization. Sometimes this situation is that we need to check the condition of the code and we know that it will rarely be `true` or `false`. With these macros we can tell to the compiler about this. For example
+You can [often](http://lxr.free-electrons.com/ident?i=unlikely) find these macros in the code of the Linux kernel. The main purpose of these macros is optimization. Sometimes we need to check a condition in the code and we know that it will rarely be `true` or `false`. With these macros we can tell the compiler about this. For example:
```C
static int proc_root_readdir(struct file *file, struct dir_context *ctx)
@@ -447,7 +447,7 @@ Links
* [prefetchw](http://www.felixcloutier.com/x86/PREFETCHW.html)
* [3DNow](https://en.wikipedia.org/?title=3DNow!)
* [CPU caches](https://en.wikipedia.org/wiki/CPU_cache)
-* [VFS](https://en.wikipedia.org/wiki/Virtual_file_system)
+* [VFS](https://en.wikipedia.org/wiki/Virtual_file_system)
* [Linux kernel memory management](https://0xax.gitbook.io/linux-insides/summary/mm)
* [Fix-Mapped Addresses and ioremap](https://0xax.gitbook.io/linux-insides/summary/mm/linux-mm-2)
* [Extended Industry Standard Architecture](https://en.wikipedia.org/wiki/Extended_Industry_Standard_Architecture)
diff --git a/Interrupts/linux-interrupts-5.md b/Interrupts/linux-interrupts-5.md
index 1a0826ca..55f54fab 100644
--- a/Interrupts/linux-interrupts-5.md
+++ b/Interrupts/linux-interrupts-5.md
@@ -7,21 +7,21 @@ Implementation of exception handlers
This is the fifth part about an interrupts and exceptions handling in the Linux kernel and in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-4) we stopped on the setting of interrupt gates to the [Interrupt descriptor Table](https://en.wikipedia.org/wiki/Interrupt_descriptor_table). We did it in the `trap_init` function from the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/tree/master/arch/x86/kernel/traps.c) source code file. We saw only setting of these interrupt gates in the previous part and in the current part we will see implementation of the exception handlers for these gates. The preparation before an exception handler will be executed is in the [arch/x86/entry/entry_64.S](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/entry_64.S) assembly file and occurs in the [idtentry](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/entry_64.S#L820) macro that defines exceptions entry points:
```assembly
-idtentry divide_error do_divide_error has_error_code=0
-idtentry overflow do_overflow has_error_code=0
-idtentry invalid_op do_invalid_op has_error_code=0
-idtentry bounds do_bounds has_error_code=0
-idtentry device_not_available do_device_not_available has_error_code=0
-idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
-idtentry invalid_TSS do_invalid_TSS has_error_code=1
-idtentry segment_not_present do_segment_not_present has_error_code=1
-idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0
-idtentry coprocessor_error do_coprocessor_error has_error_code=0
-idtentry alignment_check do_alignment_check has_error_code=1
-idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
-```
-
-The `idtentry` macro does following preparation before an actual exception handler (`do_divide_error` for the `divide_error`, `do_overflow` for the `overflow` and etc.) will get control. In another words the `idtentry` macro allocates place for the registers ([pt_regs](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/uapi/asm/ptrace.h#L43) structure) on the stack, pushes dummy error code for the stack consistency if an interrupt/exception has no error code, checks the segment selector in the `cs` segment register and switches depends on the previous state(userspace or kernelspace). After all of these preparations it makes a call of an actual interrupt/exception handler:
+idtentry divide_error do_divide_error has_error_code=0
+idtentry overflow do_overflow has_error_code=0
+idtentry invalid_op do_invalid_op has_error_code=0
+idtentry bounds do_bounds has_error_code=0
+idtentry device_not_available do_device_not_available has_error_code=0
+idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
+idtentry invalid_TSS do_invalid_TSS has_error_code=1
+idtentry segment_not_present do_segment_not_present has_error_code=1
+idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0
+idtentry coprocessor_error do_coprocessor_error has_error_code=0
+idtentry alignment_check do_alignment_check has_error_code=1
+idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
+```
+
+The `idtentry` macro does the following preparation before an actual exception handler (`do_divide_error` for the `divide_error`, `do_overflow` for the `overflow`, etc.) gets control. In other words, the `idtentry` macro allocates place for the registers ([pt_regs](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/uapi/asm/ptrace.h#L43) structure) on the stack, pushes a dummy error code for stack consistency if an interrupt/exception has no error code, checks the segment selector in the `cs` segment register and switches depending on the previous state (userspace or kernelspace). After all of these preparations it makes a call to an actual interrupt/exception handler:
```assembly
.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
@@ -84,7 +84,7 @@ DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS)
DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check)
-```
+```
As we can see the `DO_ERROR` macro takes 4 parameters:
@@ -112,7 +112,7 @@ dotraplinkage void do_divide_error(struct pt_regs *regs, long error_code) \
}
```
-We can see that all functions which are generated by the `DO_ERROR` macro just make a call of the `do_error_trap` function from the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/tree/master/arch/x86/kernel/traps.c). Let's look on implementation of the `do_error_trap` function.
+We can see that all functions which are generated by the `DO_ERROR` macro just make a call to the `do_error_trap` function from the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/tree/master/arch/x86/kernel/traps.c). Let's look on implementation of the `do_error_trap` function.
Trap handlers
--------------------------------------------------------------------------------
@@ -150,7 +150,7 @@ enum ctx_state {
} state;
```
-The second function is `exception_exit` defined in the same [include/linux/context_tracking.h](https://github.com/torvalds/linux/tree/master/include/linux/context_tracking.h) file and checks that context tracking is enabled and call the `contert_tracking_enter` function if the previous context was `user`:
+The second function is `exception_exit`, defined in the same [include/linux/context_tracking.h](https://github.com/torvalds/linux/tree/master/include/linux/context_tracking.h) file. It checks that context tracking is enabled and calls the `context_tracking_enter` function if the previous context was `user`:
```C
static inline void exception_exit(enum ctx_state prev_ctx)
@@ -173,7 +173,7 @@ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
}
```
-First of all it calls the `notify_die` function which defined in the [kernel/notifier.c](https://github.com/torvalds/linux/tree/master/kernel/notifier.c). To get notified for [kernel panic](https://en.wikipedia.org/wiki/Kernel_panic), [kernel oops](https://en.wikipedia.org/wiki/Linux_kernel_oops), [Non-Maskable Interrupt](https://en.wikipedia.org/wiki/Non-maskable_interrupt) or other events the caller needs to insert itself in the `notify_die` chain and the `notify_die` function does it. The Linux kernel has special mechanism that allows kernel to ask when something happens and this mechanism called `notifiers` or `notifier chains`. This mechanism used for example for the `USB` hotplug events (look on the [drivers/usb/core/notify.c](https://github.com/torvalds/linux/tree/master/drivers/usb/core/notify.c)), for the memory [hotplug](https://en.wikipedia.org/wiki/Hot_swapping) (look on the [include/linux/memory.h](https://github.com/torvalds/linux/tree/master/include/linux/memory.h), the `hotplug_memory_notifier` macro and etc...), system reboots and etc. A notifier chain is thus a simple, singly-linked list. When a Linux kernel subsystem wants to be notified of specific events, it fills out a special `notifier_block` structure and passes it to the `notifier_chain_register` function. An event can be sent with the call of the `notifier_call_chain` function. First of all the `notify_die` function fills `die_args` structure with the trap number, trap string, registers and other values:
+First of all it calls the `notify_die` function which is defined in the [kernel/notifier.c](https://github.com/torvalds/linux/tree/master/kernel/notifier.c). To get notified for [kernel panic](https://en.wikipedia.org/wiki/Kernel_panic), [kernel oops](https://en.wikipedia.org/wiki/Linux_kernel_oops), [Non-Maskable Interrupt](https://en.wikipedia.org/wiki/Non-maskable_interrupt) or other events the caller needs to insert itself in the `notify_die` chain and the `notify_die` function does it. The Linux kernel has a special mechanism that allows the kernel to ask when something happens and this mechanism is called `notifiers` or `notifier chains`. This mechanism is used, for example, for the `USB` hotplug events (look at the [drivers/usb/core/notify.c](https://github.com/torvalds/linux/tree/master/drivers/usb/core/notify.c)), for the memory [hotplug](https://en.wikipedia.org/wiki/Hot_swapping) (look at the [include/linux/memory.h](https://github.com/torvalds/linux/tree/master/include/linux/memory.h), the `hotplug_memory_notifier` macro, etc.), system reboots, etc. A notifier chain is thus a simple, singly-linked list. When a Linux kernel subsystem wants to be notified of specific events, it fills out a special `notifier_block` structure and passes it to the `notifier_chain_register` function. An event can be sent with the call of the `notifier_call_chain` function. First of all the `notify_die` function fills the `die_args` structure with the trap number, trap string, registers and other values:
```C
struct die_args args = {
@@ -247,7 +247,7 @@ if (!fixup_exception(regs)) {
}
```
-The `die` function defined in the [arch/x86/kernel/dumpstack.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/dumpstack.c) source code file, prints useful information about stack, registers, kernel modules and caused kernel [oops](https://en.wikipedia.org/wiki/Linux_kernel_oops). If we came from the userspace the `do_trap_no_signal` function will return `-1` and the execution of the `do_trap` function will continue. If we passed through the `do_trap_no_signal` function and did not exit from the `do_trap` after this, it means that previous context was - `user`. Most exceptions caused by the processor are interpreted by Linux as error conditions, for example division by zero, invalid opcode and etc. When an exception occurs the Linux kernel sends a [signal](https://en.wikipedia.org/wiki/Unix_signal) to the interrupted process that caused the exception to notify it of an incorrect condition. So, in the `do_trap` function we need to send a signal with the given number (`SIGFPE` for the divide error, `SIGILL` for a illegal instruction and etc...). First of all we save error code and vector number in the current interrupts process with the filling `thread.error_code` and `thread_trap_nr`:
+The `die` function, defined in the [arch/x86/kernel/dumpstack.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/dumpstack.c) source code file, prints useful information about stack, registers, kernel modules and caused kernel [oops](https://en.wikipedia.org/wiki/Linux_kernel_oops). If we came from the userspace the `do_trap_no_signal` function will return `-1` and the execution of the `do_trap` function will continue. If we passed through the `do_trap_no_signal` function and did not exit from the `do_trap` after this, it means that the previous context was `user`. Most exceptions caused by the processor are interpreted by Linux as error conditions, for example division by zero, invalid opcode, etc. When an exception occurs the Linux kernel sends a [signal](https://en.wikipedia.org/wiki/Unix_signal) to the interrupted process that caused the exception to notify it of an incorrect condition. So, in the `do_trap` function we need to send a signal with the given number (`SIGFPE` for the divide error, `SIGILL` for an illegal instruction, etc.). First of all we save the error code and vector number in the currently interrupted process by filling `thread.error_code` and `thread.trap_nr`:
```C
tsk->thread.error_code = error_code;
@@ -275,7 +275,7 @@ And send a given signal to interrupted process:
force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk);
```
-This is the end of the `do_trap`. We just saw generic implementation for eight different exceptions which are defined with the `DO_ERROR` macro. Now let's look on another exception handlers.
+This is the end of the `do_trap`. We just saw generic implementation for eight different exceptions which are defined with the `DO_ERROR` macro. Now let's look at other exception handlers.
Double fault
--------------------------------------------------------------------------------
@@ -286,7 +286,7 @@ The next exception is `#DF` or `Double fault`. This exception occurs when the pr
set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);
```
-Note that this exception runs on the `DOUBLEFAULT_STACK` [Interrupt Stack Table](https://www.kernel.org/doc/Documentation/x86/x86_64/kernel-stacks) which has index - `1`:
+Note that this exception runs on the `DOUBLEFAULT_STACK` [Interrupt Stack Table](https://www.kernel.org/doc/Documentation/x86/kernel-stacks) which has index - `1`:
```C
#define DOUBLEFAULT_STACK 1
@@ -485,7 +485,7 @@ Links
* [printk](https://en.wikipedia.org/wiki/Printk)
* [coprocessor](https://en.wikipedia.org/wiki/Coprocessor)
* [SIMD](https://en.wikipedia.org/wiki/SIMD)
-* [Interrupt Stack Table](https://www.kernel.org/doc/Documentation/x86/x86_64/kernel-stacks)
+* [Interrupt Stack Table](https://www.kernel.org/doc/Documentation/x86/kernel-stacks)
* [PID](https://en.wikipedia.org/wiki/Process_identifier)
* [x87 FPU](https://en.wikipedia.org/wiki/X87)
* [control register](https://en.wikipedia.org/wiki/Control_register)
diff --git a/Interrupts/linux-interrupts-6.md b/Interrupts/linux-interrupts-6.md
index 33838072..4a3ecac1 100644
--- a/Interrupts/linux-interrupts-6.md
+++ b/Interrupts/linux-interrupts-6.md
@@ -4,7 +4,7 @@ Interrupts and Interrupt Handling. Part 6.
Non-maskable interrupt handler
--------------------------------------------------------------------------------
-It is sixth part of the [Interrupts and Interrupt Handling in the Linux kernel](https://0xax.gitbook.io/linux-insides/summary/interrupts) chapter and in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-5) we saw implementation of some exception handlers for the [General Protection Fault](https://en.wikipedia.org/wiki/General_protection_fault) exception, divide exception, invalid [opcode](https://en.wikipedia.org/wiki/Opcode) exceptions and etc. As I wrote in the previous part we will see implementations of the rest exceptions in this part. We will see implementation of the following handlers:
+It is the sixth part of the [Interrupts and Interrupt Handling in the Linux kernel](https://0xax.gitbook.io/linux-insides/summary/interrupts) chapter and in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-5) we saw the implementation of some exception handlers for the [General Protection Fault](https://en.wikipedia.org/wiki/General_protection_fault) exception, divide exception, invalid [opcode](https://en.wikipedia.org/wiki/Opcode) exceptions, etc. As I wrote in the previous part, we will see implementations of the rest of the exceptions in this part. We will see the implementation of the following handlers:
* [Non-Maskable](https://en.wikipedia.org/wiki/Non-maskable_interrupt) interrupt;
* [BOUND](http://pdos.csail.mit.edu/6.828/2005/readings/i386/BOUND.htm) Range Exceeded Exception;
@@ -169,7 +169,7 @@ pushq $-1
ALLOC_PT_GPREGS_ON_STACK
```
-We already saw implementation of the `ALLOC_PT_GREGS_ON_STACK` macro in the third part of the interrupts [chapter](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-3). This macro defined in the [arch/x86/entry/calling.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/calling.h) and yet another allocates `120` bytes on stack for the general purpose registers, from the `rdi` to the `r15`:
+We already saw the implementation of the `ALLOC_PT_GPREGS_ON_STACK` macro in the third part of the interrupts [chapter](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-3). This macro is defined in [arch/x86/entry/calling.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/entry/calling.h) and once again allocates `120` bytes on the stack for the general purpose registers, from `rdi` to `r15`:
```assembly
.macro ALLOC_PT_GPREGS_ON_STACK addskip=0
@@ -261,7 +261,7 @@ Now let's look on the `do_nmi` exception handler. This function defined in the [
* error code.
as all exception handlers. The `do_nmi` starts from the call of the `nmi_nesting_preprocess` function and ends with the call of the `nmi_nesting_postprocess`. The `nmi_nesting_preprocess` function checks that we likely do not work with the debug stack and if we on the debug stack set the `update_debug_stack` [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) variable to `1` and call the `debug_stack_set_zero` function from the [arch/x86/kernel/cpu/common.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/cpu/common.c). This function increases the `debug_stack_use_ctr` per-cpu variable and loads new `Interrupt Descriptor Table`:
-
+
```C
static inline void nmi_nesting_preprocess(struct pt_regs *regs)
{
@@ -362,7 +362,7 @@ After all of this, there is still only one way when `MPX` is responsible for thi
Coprocessor exception and SIMD exception
--------------------------------------------------------------------------------
-The next two exceptions are [x87 FPU](https://en.wikipedia.org/wiki/X87) Floating-Point Error exception or `#MF` and [SIMD](https://en.wikipedia.org/wiki/SIMD) Floating-Point Exception or `#XF`. The first exception occurs when the `x87 FPU` has detected floating point error. For example divide by zero, numeric overflow and etc. The second exception occurs when the processor has detected [SSE/SSE2/SSE3](https://en.wikipedia.org/wiki/SSE3) `SIMD` floating-point exception. It can be the same as for the `x87 FPU`. The handlers for these exceptions are `do_coprocessor_error` and `do_simd_coprocessor_error` are defined in the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/traps.c) and very similar on each other. They both make a call of the `math_error` function from the same source code file but pass different vector number. The `do_coprocessor_error` passes `X86_TRAP_MF` vector number to the `math_error`:
+The next two exceptions are the [x87 FPU](https://en.wikipedia.org/wiki/X87) Floating-Point Error exception or `#MF` and the [SIMD](https://en.wikipedia.org/wiki/SIMD) Floating-Point Exception or `#XF`. The first exception occurs when the `x87 FPU` has detected a floating point error, for example divide by zero, numeric overflow, etc. The second exception occurs when the processor has detected an [SSE/SSE2/SSE3](https://en.wikipedia.org/wiki/SSE3) `SIMD` floating-point exception. It can be the same as for the `x87 FPU`. The handlers for these exceptions, `do_coprocessor_error` and `do_simd_coprocessor_error`, are defined in [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/traps.c) and are very similar to each other. They both make a call of the `math_error` function from the same source code file but pass a different vector number. The `do_coprocessor_error` passes the `X86_TRAP_MF` vector number to `math_error`:
```C
dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
@@ -389,7 +389,7 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
}
```
-First of all the `math_error` function defines current interrupted task, address of its fpu, string which describes an exception, add it to the `notify_die` chain and return from the exception handler if it will return `NOTIFY_STOP`:
+First of all the `math_error` function defines the current interrupted task, the address of its FPU and the string which describes an exception, adds it to the `notify_die` chain and returns from the exception handler if it returns `NOTIFY_STOP`:
```C
struct task_struct *task = current;
@@ -434,7 +434,7 @@ After this we check the signal code and if it is non-zero we return:
if (!info.si_code)
return;
```
-
+
Or send the `SIGFPE` signal in the end:
```C
@@ -446,7 +446,7 @@ That's all.
Conclusion
--------------------------------------------------------------------------------
-It is the end of the sixth part of the [Interrupts and Interrupt Handling](https://0xax.gitbook.io/linux-insides/summary/interrupts) chapter and we saw implementation of some exception handlers in this part, like `non-maskable` interrupt, [SIMD](https://en.wikipedia.org/wiki/SIMD) and [x87 FPU](https://en.wikipedia.org/wiki/X87) floating point exception. Finally we have finished with the `trap_init` function in this part and will go ahead in the next part. The next our point is the external interrupts and the `early_irq_init` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c).
+It is the end of the sixth part of the [Interrupts and Interrupt Handling](https://0xax.gitbook.io/linux-insides/summary/interrupts) chapter and we saw the implementation of some exception handlers in this part, like the `non-maskable` interrupt, [SIMD](https://en.wikipedia.org/wiki/SIMD) and [x87 FPU](https://en.wikipedia.org/wiki/X87) floating point exceptions. Finally, we finished with the `trap_init` function in this part and will go ahead in the next part. Our next point is the external interrupts and the `early_irq_init` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c).
If you have any questions or suggestions write me a comment or ping me at [twitter](https://twitter.com/0xAX).
@@ -457,7 +457,7 @@ Links
* [General Protection Fault](https://en.wikipedia.org/wiki/General_protection_fault)
* [opcode](https://en.wikipedia.org/wiki/Opcode)
-* [Non-Maskable](https://en.wikipedia.org/wiki/Non-maskable_interrupt)
+* [Non-Maskable](https://en.wikipedia.org/wiki/Non-maskable_interrupt)
* [BOUND instruction](http://pdos.csail.mit.edu/6.828/2005/readings/i386/BOUND.htm)
* [CPU socket](https://en.wikipedia.org/wiki/CPU_socket)
* [Interrupt Descriptor Table](https://en.wikipedia.org/wiki/Interrupt_descriptor_table)
@@ -474,7 +474,7 @@ Links
* [stack frame](https://en.wikipedia.org/wiki/Call_stack)
* [Model Specific register](https://en.wikipedia.org/wiki/Model-specific_register)
* [percpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1)
-* [RCU](https://en.wikipedia.org/wiki/Read-copy-update)
+* [RCU](https://en.wikipedia.org/wiki/Read-copy-update)
* [MPX](https://en.wikipedia.org/wiki/Intel_MPX)
* [x87 FPU](https://en.wikipedia.org/wiki/X87)
* [Previous part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-5)
diff --git a/Interrupts/linux-interrupts-7.md b/Interrupts/linux-interrupts-7.md
index 4f96fd34..d96401b5 100644
--- a/Interrupts/linux-interrupts-7.md
+++ b/Interrupts/linux-interrupts-7.md
@@ -4,9 +4,9 @@ Interrupts and Interrupt Handling. Part 7.
Introduction to external interrupts
--------------------------------------------------------------------------------
-This is the seventh part of the Interrupts and Interrupt Handling in the Linux kernel [chapter](https://0xax.gitbook.io/linux-insides/summary/interrupts) and in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-6) we have finished with the exceptions which are generated by the processor. In this part we will continue to dive to the interrupt handling and will start with the external hardware interrupt handling. As you can remember, in the previous part we have finished with the `trap_init` function from the [arch/x86/kernel/trap.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/traps.c) and the next step is the call of the `early_irq_init` function from the [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c).
+This is the seventh part of the Interrupts and Interrupt Handling in the Linux kernel [chapter](https://0xax.gitbook.io/linux-insides/summary/interrupts) and in the previous [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-6) we have finished with the exceptions which are generated by the processor. In this part we will continue to dive into the interrupt handling and will start with the external hardware interrupt handling. As you can remember, in the previous part we have finished with the `trap_init` function from the [arch/x86/kernel/traps.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/traps.c) and the next step is the call of the `early_irq_init` function from [init/main.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/init/main.c).
-Interrupts are signal that are sent across [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) or `Interrupt Request Line` by a hardware or software. External hardware interrupts allow devices like keyboard, mouse and etc, to indicate that it needs attention of the processor. Once the processor receives the `Interrupt Request`, it will temporary stop execution of the running program and invoke special routine which depends on an interrupt. We already know that this routine is called interrupt handler (or how we will call it `ISR` or `Interrupt Service Routine` from this part). The `ISR` or `Interrupt Handler Routine` can be found in Interrupt Vector table that is located at fixed address in the memory. After the interrupt is handled processor resumes the interrupted process. At the boot/initialization time, the Linux kernel identifies all devices in the machine, and appropriate interrupt handlers are loaded into the interrupt table. As we saw in the previous parts, most exceptions are handled simply by the sending a [Unix signal](https://en.wikipedia.org/wiki/Unix_signal) to the interrupted process. That's why kernel is can handle an exception quickly. Unfortunately we can not use this approach for the external hardware interrupts, because often they arrive after (and sometimes long after) the process to which they are related has been suspended. So it would make no sense to send a Unix signal to the current process. External interrupt handling depends on the type of an interrupt:
+Interrupts are signals that are sent across [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) or `Interrupt Request Line` by hardware or software. External hardware interrupts allow devices like keyboard, mouse, etc., to indicate that they need the attention of the processor. Once the processor receives the `Interrupt Request`, it will temporarily stop execution of the running program and invoke a special routine which depends on the interrupt. We already know that this routine is called an interrupt handler (or, as we will call it from this part, `ISR` or `Interrupt Service Routine`). The `ISR` or `Interrupt Service Routine` can be found in the Interrupt Vector table that is located at a fixed address in memory. After the interrupt is handled, the processor resumes the interrupted process. At the boot/initialization time, the Linux kernel identifies all devices in the machine, and appropriate interrupt handlers are loaded into the interrupt table. As we saw in the previous parts, most exceptions are handled simply by sending a [Unix signal](https://en.wikipedia.org/wiki/Unix_signal) to the interrupted process. That's how the kernel can handle an exception quickly. Unfortunately we cannot use this approach for the external hardware interrupts, because often they arrive after (and sometimes long after) the process to which they are related has been suspended. So it would make no sense to send a Unix signal to the current process. External interrupt handling depends on the type of an interrupt:
* `I/O` interrupts;
* Timer interrupts;
@@ -21,7 +21,7 @@ Generally, a handler of an `I/O` interrupt must be flexible enough to service se
* Execute the interrupt service routine (next we will call it `ISR`) which is associated with the device;
* Restore registers and return from an interrupt;
-Ok, we know a little theory and now let's start with the `early_irq_init` function. The implementation of the `early_irq_init` function is in the [kernel/irq/irqdesc.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/irq/irqdesc.c). This function make early initialization of the `irq_desc` structure. The `irq_desc` structure is the foundation of interrupt management code in the Linux kernel. An array of this structure, which has the same name - `irq_desc`, keeps track of every interrupt request source in the Linux kernel. This structure defined in the [include/linux/irqdesc.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/irqdesc.h) and as you can note it depends on the `CONFIG_SPARSE_IRQ` kernel configuration option. This kernel configuration option enables support for sparse irqs. The `irq_desc` structure contains many different files:
+Ok, we know a little theory and now let's start with the `early_irq_init` function. The implementation of the `early_irq_init` function is in [kernel/irq/irqdesc.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/irq/irqdesc.c). This function makes early initialization of the `irq_desc` structure. The `irq_desc` structure is the foundation of interrupt management code in the Linux kernel. An array of this structure, which has the same name - `irq_desc`, keeps track of every interrupt request source in the Linux kernel. This structure is defined in [include/linux/irqdesc.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/irqdesc.h) and as you can note it depends on the `CONFIG_SPARSE_IRQ` kernel configuration option. This kernel configuration option enables support for sparse IRQs. The `irq_desc` structure contains many different fields:
* `irq_common_data` - per irq and chip data passed down to chip functions;
* `status_use_accessors` - contains status of the interrupt source which is combination of the values from the `enum` from the [include/linux/irq.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/irq.h) and different macros which are defined in the same source code file;
@@ -29,7 +29,7 @@ Ok, we know a little theory and now let's start with the `early_irq_init` functi
* `handle_irq` - highlevel irq-events handler;
* `action` - identifies the interrupt service routines to be invoked when the [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) occurs;
* `irq_count` - counter of interrupt occurrences on the IRQ line;
-* `depth` - `0` if the IRQ line is enabled and a positive value if it has been disabled at least once;
+* `depth` - `0` if the IRQ line is enabled and a positive value if it has been disabled at least once;
* `last_unhandled` - aging timer for unhandled count;
* `irqs_unhandled` - count of the unhandled interrupts;
* `lock` - a spin lock used to serialize the accesses to the `IRQ` descriptor;
@@ -75,10 +75,10 @@ As I already wrote, implementation of the `first_online_node` macro depends on t
#if MAX_NUMNODES > 1
#define first_online_node first_node(node_states[N_ONLINE])
#else
- #define first_online_node 0
+ #define first_online_node 0
```
-The `node_states` is the [enum](https://en.wikipedia.org/wiki/Enumerated_type) which defined in the [include/linux/nodemask.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/nodemask.h) and represent the set of the states of a node. In our case we are searching an online node and it will be `0` if `MAX_NUMNODES` is one or zero. If the `MAX_NUMNODES` is greater than one, the `node_states[N_ONLINE]` will return `1` and the `first_node` macro will be expands to the call of the `__first_node` function which will return `minimal` or the first online node:
+The `node_states` is the [enum](https://en.wikipedia.org/wiki/Enumerated_type) which is defined in [include/linux/nodemask.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/nodemask.h) and represents the set of the states of a node. In our case we are searching for an online node and it will be `0` if `MAX_NUMNODES` is one or zero. If the `MAX_NUMNODES` is greater than one, the `node_states[N_ONLINE]` will return `1` and the `first_node` macro will be expanded to the call of the `__first_node` function which will return `minimal` or the first online node:
```C
#define first_node(src) __first_node(&(src))
@@ -113,7 +113,7 @@ static void __init init_irq_default_affinity(void)
#endif
```
-We know that when a hardware, such as disk controller or keyboard, needs attention from the processor, it throws an interrupt. The interrupt tells to the processor that something has happened and that the processor should interrupt current process and handle an incoming event. In order to prevent multiple devices from sending the same interrupts, the [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) system was established where each device in a computer system is assigned its own special IRQ so that its interrupts are unique. Linux kernel can assign certain `IRQs` to specific processors. This is known as `SMP IRQ affinity`, and it allows you control how your system will respond to various hardware events (that's why it has certain implementation only if the `CONFIG_SMP` kernel configuration option is set). After we allocated `irq_default_affinity` cpumask, we can see `printk` output:
+We know that when hardware, such as a disk controller or keyboard, needs attention from the processor, it throws an interrupt. The interrupt tells the processor that something has happened and that the processor should interrupt the current process and handle an incoming event. In order to prevent multiple devices from sending the same interrupts, the [IRQ](https://en.wikipedia.org/wiki/Interrupt_request_%28PC_architecture%29) system was established where each device in a computer system is assigned its own special IRQ so that its interrupts are unique. The Linux kernel can assign certain `IRQs` to specific processors. This is known as `SMP IRQ affinity`, and it allows you to control how your system will respond to various hardware events (that's why it has certain implementation only if the `CONFIG_SMP` kernel configuration option is set). After we have allocated the `irq_default_affinity` cpumask, we can see `printk` output:
```C
printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
@@ -189,7 +189,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
The `irq_desc` is array of the `irq` descriptors. It has three already initialized fields:
-* `handle_irq` - as I already wrote above, this field is the highlevel irq-event handler. In our case it initialized with the `handle_bad_irq` function that defined in the [kernel/irq/handle.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/irq/handle.c) source code file and handles spurious and unhandled irqs;
+* `handle_irq` - as I already wrote above, this field is the highlevel irq-event handler. In our case it is initialized with the `handle_bad_irq` function that is defined in the [kernel/irq/handle.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/irq/handle.c) source code file and handles spurious and unhandled IRQs;
* `depth` - `0` if the IRQ line is enabled and a positive value if it has been disabled at least once;
* `lock` - A spin lock used to serialize the accesses to the `IRQ` descriptor.
@@ -258,7 +258,7 @@ irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
...
```
-In the next step we set the high level interrupt handlers to the `handle_bad_irq` which handles spurious and unhandled irqs (as the hardware stuff is not initialized yet, we set this handler), set `irq_desc.desc` to `1` which means that an `IRQ` is disabled, reset count of the unhandled interrupts and interrupts in general:
+In the next step we set the high level interrupt handlers to the `handle_bad_irq` which handles spurious and unhandled IRQs (as the hardware stuff is not initialized yet, we set this handler), set `irq_desc.depth` to `1` which means that an `IRQ` is disabled, reset the count of the unhandled interrupts and interrupts in general:
```C
...
@@ -294,14 +294,14 @@ static void desc_smp_init(struct irq_desc *desc, int node)
#endif
}
```
-
+
In the end of the `early_irq_init` function we return the return value of the `arch_early_irq_init` function:
```C
return arch_early_irq_init();
```
-This function defined in the [kernel/apic/vector.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/apic/vector.c) and contains only one call of the `arch_early_ioapic_init` function from the [kernel/apic/io_apic.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/apic/io_apic.c). As we can understand from the `arch_early_ioapic_init` function's name, this function makes early initialization of the [I/O APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller). First of all it make a check of the number of the legacy interrupts with the call of the `nr_legacy_irqs` function. If we have no legacy interrupts with the [Intel 8259](https://en.wikipedia.org/wiki/Intel_8259) programmable interrupt controller we set `io_apic_irqs` to the `0xffffffffffffffff`:
+This function is defined in [arch/x86/kernel/apic/vector.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/apic/vector.c) and contains only one call of the `arch_early_ioapic_init` function from [arch/x86/kernel/apic/io_apic.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/apic/io_apic.c). As we can understand from the `arch_early_ioapic_init` function's name, this function makes early initialization of the [I/O APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller). First of all it makes a check of the number of the legacy interrupts with the call of the `nr_legacy_irqs` function. If we have no legacy interrupts with the [Intel 8259](https://en.wikipedia.org/wiki/Intel_8259) programmable interrupt controller we set `io_apic_irqs` to `0xffffffffffffffff`:
```C
if (!nr_legacy_irqs())
@@ -315,7 +315,7 @@ for_each_ioapic(i)
alloc_ioapic_saved_registers(i);
```
-And in the end of the `arch_early_ioapic_init` function we are going through the all legacy irqs (from `IRQ0` to `IRQ15`) in the loop and allocate space for the `irq_cfg` which represents configuration of an irq on the given `NUMA` node:
+And at the end of the `arch_early_ioapic_init` function we go through all the legacy IRQs (from `IRQ0` to `IRQ15`) in the loop and allocate space for the `irq_cfg` which represents the configuration of an IRQ on the given `NUMA` node:
```C
for (i = 0; i < nr_legacy_irqs(); i++) {
@@ -330,7 +330,7 @@ That's all.
Sparse IRQs
--------------------------------------------------------------------------------
-We already saw in the beginning of this part that implementation of the `early_irq_init` function depends on the `CONFIG_SPARSE_IRQ` kernel configuration option. Previously we saw implementation of the `early_irq_init` function when the `CONFIG_SPARSE_IRQ` configuration option is not set, now let's look on the its implementation when this option is set. Implementation of this function very similar, but little differ. We can see the same definition of variables and call of the `init_irq_default_affinity` in the beginning of the `early_irq_init` function:
+We already saw in the beginning of this part that the implementation of the `early_irq_init` function depends on the `CONFIG_SPARSE_IRQ` kernel configuration option. Previously we saw the implementation of the `early_irq_init` function when the `CONFIG_SPARSE_IRQ` configuration option is not set, now let's look at its implementation when this option is set. The implementation of this function is very similar, but differs slightly. We can see the same definition of variables and call of the `init_irq_default_affinity` in the beginning of the `early_irq_init` function:
```C
#ifdef CONFIG_SPARSE_IRQ
@@ -356,7 +356,7 @@ But after this we can see the following call:
initcnt = arch_probe_nr_irqs();
```
-The `arch_probe_nr_irqs` function defined in the [arch/x86/kernel/apic/vector.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/apic/vector.c) and calculates count of the pre-allocated irqs and update `nr_irqs` with its number. But stop. Why there are pre-allocated irqs? There is alternative form of interrupts called - [Message Signaled Interrupts](https://en.wikipedia.org/wiki/Message_Signaled_Interrupts) available in the [PCI](https://en.wikipedia.org/wiki/Conventional_PCI). Instead of assigning a fixed number of the interrupt request, the device is allowed to record a message at a particular address of RAM, in fact, the display on the [Local APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller#Integrated_local_APICs). `MSI` permits a device to allocate `1`, `2`, `4`, `8`, `16` or `32` interrupts and `MSI-X` permits a device to allocate up to `2048` interrupts. Now we know that irqs can be pre-allocated. More about `MSI` will be in a next part, but now let's look on the `arch_probe_nr_irqs` function. We can see the check which assign amount of the interrupt vectors for the each processor in the system to the `nr_irqs` if it is greater and calculate the `nr` which represents number of `MSI` interrupts:
+The `arch_probe_nr_irqs` function is defined in [arch/x86/kernel/apic/vector.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/apic/vector.c) and calculates the count of the pre-allocated IRQs and updates `nr_irqs` with this number. But stop. Why are there pre-allocated IRQs? There is an alternative form of interrupts called [Message Signaled Interrupts](https://en.wikipedia.org/wiki/Message_Signaled_Interrupts) available in [PCI](https://en.wikipedia.org/wiki/Conventional_PCI). Instead of assigning a fixed number of the interrupt request, the device is allowed to record a message at a particular address of RAM, in fact, the display on the [Local APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller#Integrated_local_APICs). `MSI` permits a device to allocate `1`, `2`, `4`, `8`, `16` or `32` interrupts and `MSI-X` permits a device to allocate up to `2048` interrupts. Now we know that IRQs can be pre-allocated. More about `MSI` will be in a next part, but now let's look at the `arch_probe_nr_irqs` function. We can see the check which assigns the amount of the interrupt vectors for each processor in the system to the `nr_irqs` if it is greater and calculates the `nr` which represents the number of `MSI` interrupts:
```C
int nr_irqs = NR_IRQS;
@@ -367,7 +367,7 @@ if (nr_irqs > (NR_VECTORS * nr_cpu_ids))
nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids;
```
-Take a look on the `gsi_top` variable. Each `APIC` is identified with its own `ID` and with the offset where its `IRQ` starts. It is called `GSI` base or `Global System Interrupt` base. So the `gsi_top` represents it. We get the `Global System Interrupt` base from the [MultiProcessor Configuration Table](https://en.wikipedia.org/wiki/MultiProcessor_Specification) table (you can remember that we have parsed this table in the sixth [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-6) of the Linux Kernel initialization process chapter).
+Take a look on the `gsi_top` variable. Each `APIC` is identified with its own `ID` and with the offset where its `IRQ` starts. It is called `GSI` base or `Global System Interrupt` base. So the `gsi_top` represents it. We get the `Global System Interrupt` base from the [MultiProcessor Configuration Table](https://en.wikipedia.org/wiki/MultiProcessor_Specification) table (you can remember that we have parsed this table in the sixth [part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-6) of the Linux kernel initialization process chapter).
After this we update the `nr` depends on the value of the `gsi_top`:
@@ -380,7 +380,7 @@ After this we update the `nr` depends on the value of the `gsi_top`:
#endif
```
-Update the `nr_irqs` if it less than `nr` and return the number of the legacy irqs:
+Update the `nr_irqs` if it is greater than `nr` and return the number of the legacy IRQs:
```C
if (nr < nr_irqs)
diff --git a/Interrupts/linux-interrupts-8.md b/Interrupts/linux-interrupts-8.md
index 74888f83..7adc926a 100644
--- a/Interrupts/linux-interrupts-8.md
+++ b/Interrupts/linux-interrupts-8.md
@@ -113,10 +113,10 @@ In the end of the `init_IRQ` function we can see the call of the following funct
x86_init.irqs.intr_init();
```
-from the [arch/x86/kernel/x86_init.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/x86_init.c) source code file. If you have read [chapter](https://0xax.gitbook.io/linux-insides/summary/initialization) about the Linux kernel initialization process, you can remember the `x86_init` structure. This structure contains a couple of files which are points to the function related to the platform setup (`x86_64` in our case), for example `resources` - related with the memory resources, `mpparse` - related with the parsing of the [MultiProcessor Configuration Table](https://en.wikipedia.org/wiki/MultiProcessor_Specification) table and etc.). As we can see the `x86_init` also contains the `irqs` field which contains three following fields:
+from the [arch/x86/kernel/x86_init.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/x86_init.c) source code file. If you have read [chapter](https://0xax.gitbook.io/linux-insides/summary/initialization) about the Linux kernel initialization process, you can remember the `x86_init` structure. This structure contains a couple of fields which point to the functions related to the platform setup (`x86_64` in our case), for example `resources` - related with the memory resources, `mpparse` - related with the parsing of the [MultiProcessor Configuration Table](https://en.wikipedia.org/wiki/MultiProcessor_Specification) table, etc. As we can see the `x86_init` also contains the `irqs` field which contains the three following fields:
```C
-struct x86_init_ops x86_init __initdata
+struct x86_init_ops x86_init __initdata
{
...
...
@@ -132,7 +132,7 @@ struct x86_init_ops x86_init __initdata
}
```
-Now, we are interesting in the `native_init_IRQ`. As we can note, the name of the `native_init_IRQ` function contains the `native_` prefix which means that this function is architecture-specific. It defined in the [arch/x86/kernel/irqinit.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/irqinit.c) and executes general initialization of the [Local APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller#Integrated_local_APICs) and initialization of the [ISA](https://en.wikipedia.org/wiki/Industry_Standard_Architecture) irqs. Let's look on the implementation of the `native_init_IRQ` function and will try to understand what occurs there. The `native_init_IRQ` function starts from the execution of the following function:
+Now, we are interested in the `native_init_IRQ`. As we can note, the name of the `native_init_IRQ` function contains the `native_` prefix which means that this function is architecture-specific. It is defined in the [arch/x86/kernel/irqinit.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/irqinit.c) and executes general initialization of the [Local APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller#Integrated_local_APICs) and initialization of the [ISA](https://en.wikipedia.org/wiki/Industry_Standard_Architecture) IRQs. Let's look at the implementation of the `native_init_IRQ` function and try to understand what occurs there. The `native_init_IRQ` function starts from the execution of the following function:
```C
x86_init.irqs.pre_vector_init();
@@ -155,21 +155,21 @@ The `irq_chip` structure defined in the [include/linux/irq.h](https://github.com
```C
$ cat /proc/interrupts
- CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7
+ CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7
0: 16 0 0 0 0 0 0 0 IO-APIC 2-edge timer
1: 2 0 0 0 0 0 0 0 IO-APIC 1-edge i8042
8: 1 0 0 0 0 0 0 0 IO-APIC 8-edge rtc0
```
-look on the last column;
+look at the last column;
* `(*irq_mask)(struct irq_data *data)` - mask an interrupt source;
* `(*irq_ack)(struct irq_data *data)` - start of a new interrupt;
* `(*irq_startup)(struct irq_data *data)` - start up the interrupt;
* `(*irq_shutdown)(struct irq_data *data)` - shutdown the interrupt
-* and etc.
+* etc.
-fields. Note that the `irq_data` structure represents set of the per irq chip data passed down to chip functions. It contains `mask` - precomputed bitmask for accessing the chip registers, `irq` - interrupt number, `hwirq` - hardware interrupt number, local to the interrupt domain chip low level interrupt hardware access and etc.
+fields. Note that the `irq_data` structure represents set of the per irq chip data passed down to chip functions. It contains `mask` - precomputed bitmask for accessing the chip registers, `irq` - interrupt number, `hwirq` - hardware interrupt number, local to the interrupt domain chip low level interrupt hardware access, etc.
After this depends on the `CONFIG_X86_64` and `CONFIG_X86_LOCAL_APIC` kernel configuration option call the `init_bsp_APIC` function from the [arch/x86/kernel/apic/apic.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/apic/apic.c):
@@ -186,7 +186,7 @@ if (smp_found_config || !cpu_has_apic)
return;
```
-In other way we return from this function. In the next step we call the `clear_local_APIC` function from the same source code file that shutdowns the local `APIC` (more about it will be in the chapter about the `Advanced Programmable Interrupt Controller`) and enable `APIC` of the first processor by the setting `unsigned int value` to the `APIC_SPIV_APIC_ENABLED`:
+Otherwise, we return from this function. In the next step we call the `clear_local_APIC` function from the same source code file that shuts down the local `APIC` (more on it in the `Advanced Programmable Interrupt Controller` chapter) and enable `APIC` of the first processor by the setting `unsigned int value` to the `APIC_SPIV_APIC_ENABLED`:
```C
value = apic_read(APIC_SPIV);
@@ -200,11 +200,11 @@ and writing it with the help of the `apic_write` function:
apic_write(APIC_SPIV, value);
```
-After we have enabled `APIC` for the bootstrap processor, we return to the `init_ISA_irqs` function and in the next step we initialize legacy `Programmable Interrupt Controller` and set the legacy chip and handler for the each legacy irq:
+After we have enabled `APIC` for the bootstrap processor, we return to the `init_ISA_irqs` function and in the next step we initialize legacy `Programmable Interrupt Controller` and set the legacy chip and handler for each legacy irq:
```C
legacy_pic->init(0);
-
+
for (i = 0; i < nr_legacy_irqs(); i++)
irq_set_chip_and_handler(i, chip, handle_level_irq);
```
@@ -229,7 +229,7 @@ struct legacy_pic default_legacy_pic = {
}
```
-The `init_8259A` function defined in the same source code file and executes initialization of the [Intel 8259](https://en.wikipedia.org/wiki/Intel_8259) ``Programmable Interrupt Controller` (more about it will be in the separate chapter about `Programmable Interrupt Controllers` and `APIC`).
+The `init_8259A` function is defined in the same source code file and executes initialization of the [Intel 8259](https://en.wikipedia.org/wiki/Intel_8259) `Programmable Interrupt Controller` (more about it will be in the separate chapter about `Programmable Interrupt Controllers` and `APIC`).
Now we can return to the `native_init_IRQ` function, after the `init_ISA_irqs` function finished its work. The next step is the call of the `apic_intr_init` function that allocates special interrupt gates which are used by the [SMP](https://en.wikipedia.org/wiki/Symmetric_multiprocessing) architecture for the [Inter-processor interrupt](https://en.wikipedia.org/wiki/Inter-processor_interrupt). The `alloc_intr_gate` macro from the [arch/x86/include/asm/desc.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/desc.h) used for the interrupt descriptor allocation:
@@ -253,7 +253,7 @@ if (!test_bit(vector, used_vectors)) {
}
```
-We already saw the `set_bit` macro, now let's look on the `test_bit` and the `first_system_vector`. The first `test_bit` macro defined in the [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/bitops.h) and looks like this:
+We already saw the `set_bit` macro, now let's look at the `test_bit` and the `first_system_vector`. The first `test_bit` macro defined in the [arch/x86/include/asm/bitops.h](https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/bitops.h) and looks like this:
```C
#define test_bit(nr, addr) \
@@ -262,7 +262,7 @@ We already saw the `set_bit` macro, now let's look on the `test_bit` and the `fi
: variable_test_bit((nr), (addr)))
```
-We can see the [ternary operator](https://en.wikipedia.org/wiki/Ternary_operation) here make a test with the [gcc](https://en.wikipedia.org/wiki/GNU_Compiler_Collection) built-in function `__builtin_constant_p` tests that given vector number (`nr`) is known at compile time. If you're feeling misunderstanding of the `__builtin_constant_p`, we can make simple test:
+We can see that the [ternary operator](https://en.wikipedia.org/wiki/Ternary_operation) here makes a test with the [gcc](https://en.wikipedia.org/wiki/GNU_Compiler_Collection) built-in function `__builtin_constant_p`, which tests whether the given vector number (`nr`) is known at compile time. If you are unsure how `__builtin_constant_p` works, we can make a simple test:
```C
#include
@@ -279,7 +279,7 @@ int main() {
}
```
-and look on the result:
+and look at the result:
```
$ gcc test.c -o test
@@ -289,7 +289,7 @@ __builtin_constant_p(PREDEFINED_VAL) is 1
__builtin_constant_p(100) is 1
```
-Now I think it must be clear for you. Let's get back to the `test_bit` macro. If the `__builtin_constant_p` will return non-zero, we call `constant_test_bit` function:
+Now I think it must be clear for you. Let's get back to the `test_bit` macro. If the `__builtin_constant_p` returns non-zero, we call `constant_test_bit` function:
```C
static inline int constant_test_bit(int nr, const void *addr)
@@ -313,7 +313,7 @@ static inline int variable_test_bit(int nr, const void *addr)
}
```
-What's the difference between two these functions and why do we need in two different functions for the same purpose? As you already can guess main purpose is optimization. If we will write simple example with these functions:
+What's the difference between these two functions and why do we need two different functions for the same purpose? As you can already guess, the main purpose is optimization. If we write a simple example with these functions:
```C
#define CONST 25
@@ -326,7 +326,7 @@ int main() {
}
```
-and will look on the assembly output of our example we will see following assembly code:
+and look at the assembly output of our example, we will see the following assembly code:
```assembly
pushq %rbp
@@ -351,10 +351,10 @@ movl %eax, %edi
call variable_test_bit
```
-for the `variable_test_bit`. These two code listings starts with the same part, first of all we save base of the current stack frame in the `%rbp` register. But after this code for both examples is different. In the first example we put `$268435456` (here the `$268435456` is our second parameter - `0x10000000`) to the `esi` and `$25` (our first parameter) to the `edi` register and call `constant_test_bit`. We put function parameters to the `esi` and `edi` registers because as we are learning Linux kernel for the `x86_64` architecture we use `System V AMD64 ABI` [calling convention](https://en.wikipedia.org/wiki/X86_calling_conventions). All is pretty simple. When we are using predefined constant, the compiler can just substitute its value. Now let's look on the second part. As you can see here, the compiler can not substitute value from the `nr` variable. In this case compiler must calculate its offset on the program's [stack frame](https://en.wikipedia.org/wiki/Call_stack). We subtract `16` from the `rsp` register to allocate stack for the local variables data and put the `$24` (value of the `nr` variable) to the `rbp` with offset `-4`. Our stack frame will be like this:
+for the `variable_test_bit`. These two code listings starts with the same part, first of all we save base of the current stack frame in the `%rbp` register. But after this code for both examples is different. In the first example we put `$268435456` (here the `$268435456` is our second parameter - `0x10000000`) to the `esi` and `$25` (our first parameter) to the `edi` register and call `constant_test_bit`. We put function parameters to the `esi` and `edi` registers because as we are learning Linux kernel for the `x86_64` architecture we use `System V AMD64 ABI` [calling convention](https://en.wikipedia.org/wiki/X86_calling_conventions). All is pretty simple. When we are using predefined constant, the compiler can just substitute its value. Now let's look at the second part. As you can see here, the compiler can not substitute value from the `nr` variable. In this case compiler must calculate its offset on the program's [stack frame](https://en.wikipedia.org/wiki/Call_stack). We subtract `16` from the `rsp` register to allocate stack for the local variables data and put the `$24` (value of the `nr` variable) to the `rbp` with offset `-4`. Our stack frame will be like this:
```
- <- stack grows
+ <- stack grows
%[rbp]
|
@@ -367,7 +367,7 @@ for the `variable_test_bit`. These two code listings starts with the same part,
%[rsp]
```
-After this we put this value to the `eax`, so `eax` register now contains value of the `nr`. In the end we do the same that in the first example, we put the `$268435456` (the first parameter of the `variable_test_bit` function) and the value of the `eax` (value of `nr`) to the `edi` register (the second parameter of the `variable_test_bit function`).
+After this we put this value to the `eax`, so `eax` register now contains value of the `nr`. In the end we do the same as in the first example: we put the `$268435456` (the second parameter of the `variable_test_bit` function) to the `esi` register and the value of the `eax` (value of `nr`) to the `edi` register (the first parameter of the `variable_test_bit` function).
The next step after the `apic_intr_init` function will finish its work is the setting interrupt gates from the `FIRST_EXTERNAL_VECTOR` or `0x20` up to `0x100`:
@@ -408,7 +408,7 @@ if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs())
setup_irq(2, &irq2);
```
-First of all let's deal with the condition. The `acpi_ioapic` variable represents existence of [I/O APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller#I.2FO_APICs). It defined in the [arch/x86/kernel/acpi/boot.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/acpi/boot.c). This variable set in the `acpi_set_irq_model_ioapic` function that called during the processing `Multiple APIC Description Table`. This occurs during initialization of the architecture-specific stuff in the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/setup.c) (more about it we will know in the other chapter about [APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller)). Note that the value of the `acpi_ioapic` variable depends on the `CONFIG_ACPI` and `CONFIG_X86_LOCAL_APIC` Linux kernel configuration options. If these options did not set, this variable will be just zero:
+First of all let's deal with the condition. The `acpi_ioapic` variable represents existence of [I/O APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller#I.2FO_APICs). It is defined in the [arch/x86/kernel/acpi/boot.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/acpi/boot.c). This variable is set in the `acpi_set_irq_model_ioapic` function that is called during the processing of the `Multiple APIC Description Table`. This occurs during initialization of the architecture-specific stuff in the [arch/x86/kernel/setup.c](https://github.com/torvalds/linux/blob/master/arch/x86/kernel/setup.c) (more about it we will know in the other chapter about [APIC](https://en.wikipedia.org/wiki/Advanced_Programmable_Interrupt_Controller)). Note that the value of the `acpi_ioapic` variable depends on the `CONFIG_ACPI` and `CONFIG_X86_LOCAL_APIC` Linux kernel configuration options. If these options are not set, this variable will be just zero:
```C
#define acpi_ioapic 0
@@ -430,7 +430,7 @@ extern int of_ioapic;
#endif
```
-If the condition will return non-zero value we call the:
+If the condition returns non-zero value we call the:
```C
setup_irq(2, &irq2);
@@ -465,7 +465,7 @@ Some time ago interrupt controller consisted of two chips and one was connected
* `IRQ 6` - drive controller;
* `IRQ 7` - `LPT1`.
-The `setup_irq` function defined in the [kernel/irq/manage.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/irq/manage.c) and takes two parameters:
+The `setup_irq` function is defined in the [kernel/irq/manage.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/irq/manage.c) and takes two parameters:
* vector number of an interrupt;
* `irqaction` structure related with an interrupt.
@@ -476,7 +476,7 @@ This function initializes interrupt descriptor from the given vector number at t
struct irq_desc *desc = irq_to_desc(irq);
```
-And call the `__setup_irq` function that setups given interrupt:
+And call the `__setup_irq` function that sets up given interrupt:
```C
chip_bus_lock(desc);
@@ -485,7 +485,7 @@ chip_bus_sync_unlock(desc);
return retval;
```
-Note that the interrupt descriptor is locked during `__setup_irq` function will work. The `__setup_irq` function makes many different things: It creates a handler thread when a thread function is supplied and the interrupt does not nest into another interrupt thread, sets the flags of the chip, fills the `irqaction` structure and many many more.
+Note that the interrupt descriptor is locked while the `__setup_irq` function works. The `__setup_irq` function does many different things: it creates a handler thread when a thread function is supplied and the interrupt does not nest into another interrupt thread, sets the flags of the chip, fills the `irqaction` structure and many many more.
All of the above it creates `/prov/vector_number` directory and fills it, but if you are using modern computer all values will be zero there:
@@ -493,16 +493,16 @@ All of the above it creates `/prov/vector_number` directory and fills it, but if
$ cat /proc/irq/2/node
0
-$cat /proc/irq/2/affinity_hint
+$ cat /proc/irq/2/affinity_hint
00
-cat /proc/irq/2/spurious
+$ cat /proc/irq/2/spurious
count 0
unhandled 0
last_unhandled 0 ms
```
-because probably `APIC` handles interrupts on the our machine.
+because probably `APIC` handles interrupts on the machine.
That's all.
diff --git a/Interrupts/linux-interrupts-9.md b/Interrupts/linux-interrupts-9.md
index c2881c02..d2350f66 100644
--- a/Interrupts/linux-interrupts-9.md
+++ b/Interrupts/linux-interrupts-9.md
@@ -16,7 +16,7 @@ As you can understand, it is almost impossible to make so that both characterist
* Top half;
* Bottom half;
-In the past there was one way to defer interrupt handling in Linux kernel. And it was called: `the bottom half` of the processor, but now it is already not actual. Now this term has remained as a common noun referring to all the different ways of organizing deferred processing of an interrupt.The deferred processing of an interrupt suggests that some of the actions for an interrupt may be postponed to a later execution when the system will be less loaded. As you can suggest, an interrupt handler can do large amount of work that is impermissible as it executes in the context where interrupts are disabled. That's why processing of an interrupt can be split on two different parts. In the first part, the main handler of an interrupt does only minimal and the most important job. After this it schedules the second part and finishes its work. When the system is less busy and context of the processor allows to handle interrupts, the second part starts its work and finishes to process remaining part of a deferred interrupt.
+In the past there was one way to defer interrupt handling in Linux kernel. And it was called: `the bottom half` of the processor, but now it is no longer relevant. Now this term has remained as a common noun referring to all the different ways of organizing deferred processing of an interrupt. The deferred processing of an interrupt suggests that some of the actions for an interrupt may be postponed to a later execution when the system will be less loaded. As you can guess, an interrupt handler can do large amount of work that is impermissible as it executes in the context where interrupts are disabled. That's why processing of an interrupt can be split into two different parts. In the first part, the main handler of an interrupt does only minimal and the most important job. After this it schedules the second part and finishes its work. When the system is less busy and context of the processor allows to handle interrupts, the second part starts its work and finishes to process remaining part of a deferred interrupt.
There are three types of `deferred interrupts` in the Linux kernel:
@@ -24,7 +24,7 @@ There are three types of `deferred interrupts` in the Linux kernel:
* `tasklets`;
* `workqueues`;
-And we will see description of all of these types in this part. As I said, we saw only a little bit about this theme, so, now is time to dive deep into details about this theme.
+And we will see a description of all of these types in this part. As I said, we saw only a little bit about this theme, so, now is time to dive deep into details about this theme.
Softirqs
----------------------------------------------------------------------------------
@@ -43,7 +43,7 @@ $ systemd-cgls -k | grep ksoft
├─ 43 [ksoftirqd/7]
```
-The `spawn_ksoftirqd` function starts this these threads. As we can see this function called as early [initcall](https://kernelnewbies.org/Documents/InitcallMechanism):
+The `spawn_ksoftirqd` function starts these threads. As we can see this function is called as an early [initcall](https://kernelnewbies.org/Documents/InitcallMechanism):
```C
early_initcall(spawn_ksoftirqd);
@@ -101,8 +101,8 @@ const char * const softirq_to_name[NR_SOFTIRQS] = {
Or we can see it in the output of the `/proc/softirqs`:
```
-~$ cat /proc/softirqs
- CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7
+~$ cat /proc/softirqs
+ CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7
HI: 5 0 0 0 0 0 0 0
TIMER: 332519 310498 289555 272913 282535 279467 282895 270979
NET_TX: 2320 0 0 2 1 1 0 0
@@ -139,7 +139,7 @@ void raise_softirq(unsigned int nr)
Here we can see the call of the `raise_softirq_irqoff` function between the `local_irq_save` and the `local_irq_restore` macros. The `local_irq_save` defined in the [include/linux/irqflags.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/include/linux/irqflags.h) header file and saves the state of the [IF](https://en.wikipedia.org/wiki/Interrupt_flag) flag of the [eflags](https://en.wikipedia.org/wiki/FLAGS_register) register and disables interrupts on the local processor. The `local_irq_restore` macro defined in the same header file and does the opposite thing: restores the `interrupt flag` and enables interrupts. We disable interrupts here because a `softirq` interrupt runs in the interrupt context and that one softirq (and no others) will be run.
-The `raise_softirq_irqoff` function marks the softirq as deffered by setting the bit corresponding to the given index `nr` in the `softirq` bit mask (`__softirq_pending`) of the local processor. It does it with the help of the:
+The `raise_softirq_irqoff` function marks the softirq as deferred by setting the bit corresponding to the given index `nr` in the `softirq` bit mask (`__softirq_pending`) of the local processor. It does it with the help of the:
```C
__raise_softirq_irqoff(nr);
@@ -186,7 +186,7 @@ if (pending) {
--max_restart)
goto restart;
}
-...
+...
```
Checks of the existence of the deferred interrupts are performed periodically. There are several points where these checks occur. The main point is the call of the `do_IRQ` function defined in [arch/x86/kernel/irq.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/irq.c), which provides the main means for actual interrupt processing in the Linux kernel. When `do_IRQ` finishes handling an interrupt, it calls the `exiting_irq` function from the [arch/x86/include/asm/apic.h](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/include/asm/apic.h) that expands to the call of the `irq_exit` function. `irq_exit` checks for deferred interrupts and the current context and calls the `invoke_softirq` function:
@@ -283,8 +283,9 @@ As we can see this structure contains five fields, they are:
* Main callback of the tasklet;
* Parameter of the callback.
-In our case, we set only for initialize only two arrays of tasklets in the `softirq_init` function: the `tasklet_vec` and the `tasklet_hi_vec`. Tasklets and high-priority tasklets are stored in the `tasklet_vec` and `tasklet_hi_vec` arrays, respectively. So, we have initialized these arrays and now we can see two calls of the `open_softirq` function that is defined in the [kernel/softirq.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/softirq.c) source code file:
-
+In our case, we initialize only two per-CPU tasklet vectors: `tasklet_vec` for normal-priority tasklets and `tasklet_hi_vec` for high-priority tasklets. These vectors are implemented as linked lists, with each CPU maintaining its own instance.
+After setting up the tasklet vectors, we register two softirq handlers using the `open_softirq` function that is defined in the [kernel/softirq.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/kernel/softirq.c) source code file:
+
```C
open_softirq(TASKLET_SOFTIRQ, tasklet_action);
open_softirq(HI_SOFTIRQ, tasklet_hi_action);
@@ -292,7 +293,7 @@ open_softirq(HI_SOFTIRQ, tasklet_hi_action);
at the end of the `softirq_init` function. The main purpose of the `open_softirq` function is the initialization of `softirq`. Let's look on the implementation of the `open_softirq` function.
-, in our case they are: `tasklet_action` and the `tasklet_hi_action` or the `softirq` function associated with the `HI_SOFTIRQ` softirq is named `tasklet_hi_action` and `softirq` function associated with the `TASKLET_SOFTIRQ` is named `tasklet_action`. The Linux kernel provides API for the manipulating of `tasklets`. First of all it is the `tasklet_init` function that takes `tasklet_struct`, function and parameter for it and initializes the given `tasklet_struct` with the given data:
+In our case they are: `tasklet_action` and the `tasklet_hi_action` or the `softirq` function associated with the `HI_SOFTIRQ` softirq is named `tasklet_hi_action` and `softirq` function associated with the `TASKLET_SOFTIRQ` is named `tasklet_action`. The Linux kernel provides API for the manipulating of `tasklets`. First of all it is the `tasklet_init` function that takes `tasklet_struct`, function and parameter for it and initializes the given `tasklet_struct` with the given data:
```C
void tasklet_init(struct tasklet_struct *t,
@@ -310,7 +311,7 @@ There are additional methods to initialize a tasklet statically with the two fol
```C
DECLARE_TASKLET(name, func, data);
-DECLARE_TASKLET_DISABLED(name, func, data);
+DECLARE_TASKLET_DISABLED(name, func, data);
```
The Linux kernel provides three following functions to mark a tasklet as ready to run:
@@ -368,7 +369,7 @@ static void tasklet_action(struct softirq_action *a)
}
```
-In the beginning of the `tasklet_action` function, we disable interrupts for the local processor with the help of the `local_irq_disable` macro (you can read about this macro in the second [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-2) of this chapter). In the next step, we take a head of the list that contains tasklets with normal priority and set this per-cpu list to `NULL` because all tasklets must be executed in a generally way. After this we enable interrupts for the local processor and go through the list of tasklets in the loop. In every iteration of the loop we call the `tasklet_trylock` function for the given tasklet that updates state of the given tasklet on `TASKLET_STATE_RUN`:
+In the beginning of the `tasklet_action` function, we disable interrupts for the local processor with the help of the `local_irq_disable` macro (you can read about this macro in the second [part](https://0xax.gitbook.io/linux-insides/summary/interrupts/linux-interrupts-2) of this chapter). In the next step, we take a head of the list that contains tasklets with normal priority and set this per-cpu list to `NULL` because all tasklets must be executed in a general way. After this we enable interrupts for the local processor and go through the list of tasklets in the loop. In every iteration of the loop we call the `tasklet_trylock` function for the given tasklet that updates state of the given tasklet on `TASKLET_STATE_RUN`:
```C
static inline int tasklet_trylock(struct tasklet_struct *t)
diff --git a/KernelStructures/linux-kernelstructure-1.md b/KernelStructures/linux-kernelstructure-1.md
index 53743501..72377f3e 100644
--- a/KernelStructures/linux-kernelstructure-1.md
+++ b/KernelStructures/linux-kernelstructure-1.md
@@ -13,9 +13,9 @@ Types of Exceptions:
* Traps - are precise exceptions reported on the boundary `following` the instruction causing the exception. The same with `%rip`;
* Aborts - are imprecise exceptions. Because they are imprecise, aborts typically do not allow reliable program restart.
-`Maskable` interrupts trigger the interrupt-handling mechanism only when RFLAGS.IF=1. Otherwise they are held pending for as long as the RFLAGS.IF bit is cleared to 0.
+`Maskable` interrupts trigger the interrupt-handling mechanism only when `RFLAGS.IF=1`. Otherwise they are held pending for as long as the `RFLAGS.IF` bit is cleared to 0.
-`Nonmaskable` interrupts (NMI) are unaffected by the value of the rFLAGS.IF bit. However, the occurrence of an NMI masks further NMIs until an IRET instruction is executed.
+`Nonmaskable` interrupts (NMI) are unaffected by the value of the `RFLAGS.IF` bit. However, the occurrence of an NMI masks further NMIs until an IRET instruction is executed.
Specific exception and interrupt sources are assigned a fixed vector-identification number (also called an “interrupt vector” or simply “vector”). The interrupt vector is used by the interrupt-handling mechanism to locate the system-software service routine assigned to the exception or interrupt. Up to
256 unique interrupt vectors are available. The first 32 vectors are reserved for predefined exception and interrupt conditions. They are defined in the [arch/x86/include/asm/traps.h](http://lxr.free-electrons.com/source/arch/x86/include/asm/traps.h#L121) header file:
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 00000000..d60efe46
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,437 @@
+Attribution-NonCommercial-ShareAlike 4.0 International
+
+=======================================================================
+
+Creative Commons Corporation ("Creative Commons") is not a law firm and
+does not provide legal services or legal advice. Distribution of
+Creative Commons public licenses does not create a lawyer-client or
+other relationship. Creative Commons makes its licenses and related
+information available on an "as-is" basis. Creative Commons gives no
+warranties regarding its licenses, any material licensed under their
+terms and conditions, or any related information. Creative Commons
+disclaims all liability for damages resulting from their use to the
+fullest extent possible.
+
+Using Creative Commons Public Licenses
+
+Creative Commons public licenses provide a standard set of terms and
+conditions that creators and other rights holders may use to share
+original works of authorship and other material subject to copyright
+and certain other rights specified in the public license below. The
+following considerations are for informational purposes only, are not
+exhaustive, and do not form part of our licenses.
+
+ Considerations for licensors: Our public licenses are
+ intended for use by those authorized to give the public
+ permission to use material in ways otherwise restricted by
+ copyright and certain other rights. Our licenses are
+ irrevocable. Licensors should read and understand the terms
+ and conditions of the license they choose before applying it.
+ Licensors should also secure all rights necessary before
+ applying our licenses so that the public can reuse the
+ material as expected. Licensors should clearly mark any
+ material not subject to the license. This includes other CC-
+ licensed material, or material used under an exception or
+ limitation to copyright. More considerations for licensors:
+ wiki.creativecommons.org/Considerations_for_licensors
+
+ Considerations for the public: By using one of our public
+ licenses, a licensor grants the public permission to use the
+ licensed material under specified terms and conditions. If
+ the licensor's permission is not necessary for any reason--for
+ example, because of any applicable exception or limitation to
+ copyright--then that use is not regulated by the license. Our
+ licenses grant only permissions under copyright and certain
+ other rights that a licensor has authority to grant. Use of
+ the licensed material may still be restricted for other
+ reasons, including because others have copyright or other
+ rights in the material. A licensor may make special requests,
+ such as asking that all changes be marked or described.
+ Although not required by our licenses, you are encouraged to
+ respect those requests where reasonable. More considerations
+ for the public:
+ wiki.creativecommons.org/Considerations_for_licensees
+
+=======================================================================
+
+Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
+Public License
+
+By exercising the Licensed Rights (defined below), You accept and agree
+to be bound by the terms and conditions of this Creative Commons
+Attribution-NonCommercial-ShareAlike 4.0 International Public License
+("Public License"). To the extent this Public License may be
+interpreted as a contract, You are granted the Licensed Rights in
+consideration of Your acceptance of these terms and conditions, and the
+Licensor grants You such rights in consideration of benefits the
+Licensor receives from making the Licensed Material available under
+these terms and conditions.
+
+
+Section 1 -- Definitions.
+
+ a. Adapted Material means material subject to Copyright and Similar
+ Rights that is derived from or based upon the Licensed Material
+ and in which the Licensed Material is translated, altered,
+ arranged, transformed, or otherwise modified in a manner requiring
+ permission under the Copyright and Similar Rights held by the
+ Licensor. For purposes of this Public License, where the Licensed
+ Material is a musical work, performance, or sound recording,
+ Adapted Material is always produced where the Licensed Material is
+ synched in timed relation with a moving image.
+
+ b. Adapter's License means the license You apply to Your Copyright
+ and Similar Rights in Your contributions to Adapted Material in
+ accordance with the terms and conditions of this Public License.
+
+ c. BY-NC-SA Compatible License means a license listed at
+ creativecommons.org/compatiblelicenses, approved by Creative
+ Commons as essentially the equivalent of this Public License.
+
+ d. Copyright and Similar Rights means copyright and/or similar rights
+ closely related to copyright including, without limitation,
+ performance, broadcast, sound recording, and Sui Generis Database
+ Rights, without regard to how the rights are labeled or
+ categorized. For purposes of this Public License, the rights
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
+ Rights.
+
+ e. Effective Technological Measures means those measures that, in the
+ absence of proper authority, may not be circumvented under laws
+ fulfilling obligations under Article 11 of the WIPO Copyright
+ Treaty adopted on December 20, 1996, and/or similar international
+ agreements.
+
+ f. Exceptions and Limitations means fair use, fair dealing, and/or
+ any other exception or limitation to Copyright and Similar Rights
+ that applies to Your use of the Licensed Material.
+
+ g. License Elements means the license attributes listed in the name
+ of a Creative Commons Public License. The License Elements of this
+ Public License are Attribution, NonCommercial, and ShareAlike.
+
+ h. Licensed Material means the artistic or literary work, database,
+ or other material to which the Licensor applied this Public
+ License.
+
+ i. Licensed Rights means the rights granted to You subject to the
+ terms and conditions of this Public License, which are limited to
+ all Copyright and Similar Rights that apply to Your use of the
+ Licensed Material and that the Licensor has authority to license.
+
+ j. Licensor means the individual(s) or entity(ies) granting rights
+ under this Public License.
+
+ k. NonCommercial means not primarily intended for or directed towards
+ commercial advantage or monetary compensation. For purposes of
+ this Public License, the exchange of the Licensed Material for
+ other material subject to Copyright and Similar Rights by digital
+ file-sharing or similar means is NonCommercial provided there is
+ no payment of monetary compensation in connection with the
+ exchange.
+
+ l. Share means to provide material to the public by any means or
+ process that requires permission under the Licensed Rights, such
+ as reproduction, public display, public performance, distribution,
+ dissemination, communication, or importation, and to make material
+ available to the public including in ways that members of the
+ public may access the material from a place and at a time
+ individually chosen by them.
+
+ m. Sui Generis Database Rights means rights other than copyright
+ resulting from Directive 96/9/EC of the European Parliament and of
+ the Council of 11 March 1996 on the legal protection of databases,
+ as amended and/or succeeded, as well as other essentially
+ equivalent rights anywhere in the world.
+
+ n. You means the individual or entity exercising the Licensed Rights
+ under this Public License. Your has a corresponding meaning.
+
+
+Section 2 -- Scope.
+
+ a. License grant.
+
+ 1. Subject to the terms and conditions of this Public License,
+ the Licensor hereby grants You a worldwide, royalty-free,
+ non-sublicensable, non-exclusive, irrevocable license to
+ exercise the Licensed Rights in the Licensed Material to:
+
+ a. reproduce and Share the Licensed Material, in whole or
+ in part, for NonCommercial purposes only; and
+
+ b. produce, reproduce, and Share Adapted Material for
+ NonCommercial purposes only.
+
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
+ Exceptions and Limitations apply to Your use, this Public
+ License does not apply, and You do not need to comply with
+ its terms and conditions.
+
+ 3. Term. The term of this Public License is specified in Section
+ 6(a).
+
+ 4. Media and formats; technical modifications allowed. The
+ Licensor authorizes You to exercise the Licensed Rights in
+ all media and formats whether now known or hereafter created,
+ and to make technical modifications necessary to do so. The
+ Licensor waives and/or agrees not to assert any right or
+ authority to forbid You from making technical modifications
+ necessary to exercise the Licensed Rights, including
+ technical modifications necessary to circumvent Effective
+ Technological Measures. For purposes of this Public License,
+ simply making modifications authorized by this Section 2(a)
+ (4) never produces Adapted Material.
+
+ 5. Downstream recipients.
+
+ a. Offer from the Licensor -- Licensed Material. Every
+ recipient of the Licensed Material automatically
+ receives an offer from the Licensor to exercise the
+ Licensed Rights under the terms and conditions of this
+ Public License.
+
+ b. Additional offer from the Licensor -- Adapted Material.
+ Every recipient of Adapted Material from You
+ automatically receives an offer from the Licensor to
+ exercise the Licensed Rights in the Adapted Material
+ under the conditions of the Adapter's License You apply.
+
+ c. No downstream restrictions. You may not offer or impose
+ any additional or different terms or conditions on, or
+ apply any Effective Technological Measures to, the
+ Licensed Material if doing so restricts exercise of the
+ Licensed Rights by any recipient of the Licensed
+ Material.
+
+ 6. No endorsement. Nothing in this Public License constitutes or
+ may be construed as permission to assert or imply that You
+ are, or that Your use of the Licensed Material is, connected
+ with, or sponsored, endorsed, or granted official status by,
+ the Licensor or others designated to receive attribution as
+ provided in Section 3(a)(1)(A)(i).
+
+ b. Other rights.
+
+ 1. Moral rights, such as the right of integrity, are not
+ licensed under this Public License, nor are publicity,
+ privacy, and/or other similar personality rights; however, to
+ the extent possible, the Licensor waives and/or agrees not to
+ assert any such rights held by the Licensor to the limited
+ extent necessary to allow You to exercise the Licensed
+ Rights, but not otherwise.
+
+ 2. Patent and trademark rights are not licensed under this
+ Public License.
+
+ 3. To the extent possible, the Licensor waives any right to
+ collect royalties from You for the exercise of the Licensed
+ Rights, whether directly or through a collecting society
+ under any voluntary or waivable statutory or compulsory
+ licensing scheme. In all other cases the Licensor expressly
+ reserves any right to collect such royalties, including when
+ the Licensed Material is used other than for NonCommercial
+ purposes.
+
+
+Section 3 -- License Conditions.
+
+Your exercise of the Licensed Rights is expressly made subject to the
+following conditions.
+
+ a. Attribution.
+
+ 1. If You Share the Licensed Material (including in modified
+ form), You must:
+
+ a. retain the following if it is supplied by the Licensor
+ with the Licensed Material:
+
+ i. identification of the creator(s) of the Licensed
+ Material and any others designated to receive
+ attribution, in any reasonable manner requested by
+ the Licensor (including by pseudonym if
+ designated);
+
+ ii. a copyright notice;
+
+ iii. a notice that refers to this Public License;
+
+ iv. a notice that refers to the disclaimer of
+ warranties;
+
+ v. a URI or hyperlink to the Licensed Material to the
+ extent reasonably practicable;
+
+ b. indicate if You modified the Licensed Material and
+ retain an indication of any previous modifications; and
+
+ c. indicate the Licensed Material is licensed under this
+ Public License, and include the text of, or the URI or
+ hyperlink to, this Public License.
+
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
+ reasonable manner based on the medium, means, and context in
+ which You Share the Licensed Material. For example, it may be
+ reasonable to satisfy the conditions by providing a URI or
+ hyperlink to a resource that includes the required
+ information.
+ 3. If requested by the Licensor, You must remove any of the
+ information required by Section 3(a)(1)(A) to the extent
+ reasonably practicable.
+
+ b. ShareAlike.
+
+ In addition to the conditions in Section 3(a), if You Share
+ Adapted Material You produce, the following conditions also apply.
+
+ 1. The Adapter's License You apply must be a Creative Commons
+ license with the same License Elements, this version or
+ later, or a BY-NC-SA Compatible License.
+
+ 2. You must include the text of, or the URI or hyperlink to, the
+ Adapter's License You apply. You may satisfy this condition
+ in any reasonable manner based on the medium, means, and
+ context in which You Share Adapted Material.
+
+ 3. You may not offer or impose any additional or different terms
+ or conditions on, or apply any Effective Technological
+ Measures to, Adapted Material that restrict exercise of the
+ rights granted under the Adapter's License You apply.
+
+
+Section 4 -- Sui Generis Database Rights.
+
+Where the Licensed Rights include Sui Generis Database Rights that
+apply to Your use of the Licensed Material:
+
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
+ to extract, reuse, reproduce, and Share all or a substantial
+ portion of the contents of the database for NonCommercial purposes
+ only;
+
+ b. if You include all or a substantial portion of the database
+ contents in a database in which You have Sui Generis Database
+ Rights, then the database in which You have Sui Generis Database
+ Rights (but not its individual contents) is Adapted Material,
+ including for purposes of Section 3(b); and
+
+ c. You must comply with the conditions in Section 3(a) if You Share
+ all or a substantial portion of the contents of the database.
+
+For the avoidance of doubt, this Section 4 supplements and does not
+replace Your obligations under this Public License where the Licensed
+Rights include other Copyright and Similar Rights.
+
+
+Section 5 -- Disclaimer of Warranties and Limitation of Liability.
+
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
+
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
+
+ c. The disclaimer of warranties and limitation of liability provided
+ above shall be interpreted in a manner that, to the extent
+ possible, most closely approximates an absolute disclaimer and
+ waiver of all liability.
+
+
+Section 6 -- Term and Termination.
+
+ a. This Public License applies for the term of the Copyright and
+ Similar Rights licensed here. However, if You fail to comply with
+ this Public License, then Your rights under this Public License
+ terminate automatically.
+
+ b. Where Your right to use the Licensed Material has terminated under
+ Section 6(a), it reinstates:
+
+ 1. automatically as of the date the violation is cured, provided
+ it is cured within 30 days of Your discovery of the
+ violation; or
+
+ 2. upon express reinstatement by the Licensor.
+
+ For the avoidance of doubt, this Section 6(b) does not affect any
+ right the Licensor may have to seek remedies for Your violations
+ of this Public License.
+
+ c. For the avoidance of doubt, the Licensor may also offer the
+ Licensed Material under separate terms or conditions or stop
+ distributing the Licensed Material at any time; however, doing so
+ will not terminate this Public License.
+
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
+ License.
+
+
+Section 7 -- Other Terms and Conditions.
+
+ a. The Licensor shall not be bound by any additional or different
+ terms or conditions communicated by You unless expressly agreed.
+
+ b. Any arrangements, understandings, or agreements regarding the
+ Licensed Material not stated herein are separate from and
+ independent of the terms and conditions of this Public License.
+
+
+Section 8 -- Interpretation.
+
+ a. For the avoidance of doubt, this Public License does not, and
+ shall not be interpreted to, reduce, limit, restrict, or impose
+ conditions on any use of the Licensed Material that could lawfully
+ be made without permission under this Public License.
+
+ b. To the extent possible, if any provision of this Public License is
+ deemed unenforceable, it shall be automatically reformed to the
+ minimum extent necessary to make it enforceable. If the provision
+ cannot be reformed, it shall be severed from this Public License
+ without affecting the enforceability of the remaining terms and
+ conditions.
+
+ c. No term or condition of this Public License will be waived and no
+ failure to comply consented to unless expressly agreed to by the
+ Licensor.
+
+ d. Nothing in this Public License constitutes or may be interpreted
+ as a limitation upon, or waiver of, any privileges and immunities
+ that apply to the Licensor or You, including from the legal
+ processes of any jurisdiction or authority.
+
+=======================================================================
+
+Creative Commons is not a party to its public
+licenses. Notwithstanding, Creative Commons may elect to apply one of
+its public licenses to material it publishes and in those instances
+will be considered the “Licensor.” The text of the Creative Commons
+public licenses is dedicated to the public domain under the CC0 Public
+Domain Dedication. Except for the limited purpose of indicating that
+material is shared under a Creative Commons public license or as
+otherwise permitted by the Creative Commons policies published at
+creativecommons.org/policies, Creative Commons does not authorize the
+use of the trademark "Creative Commons" or any other trademark or logo
+of Creative Commons without its prior written consent including,
+without limitation, in connection with any unauthorized modifications
+to any of its public licenses or any other arrangements,
+understandings, or agreements concerning use of licensed material. For
+the avoidance of doubt, this paragraph does not form part of the
+public licenses.
+
+Creative Commons may be contacted at creativecommons.org.
diff --git a/MM/README.md b/MM/README.md
index fbf3d7af..d5f5dada 100644
--- a/MM/README.md
+++ b/MM/README.md
@@ -1,7 +1,7 @@
# Linux kernel memory management
-This chapter describes memory management in the linux kernel. You will see here a
-couple of posts which describe different parts of the linux memory management framework:
+This chapter describes memory management in the Linux kernel. You will see here a
+couple of posts which describe different parts of the Linux memory management framework:
* [Memblock](linux-mm-1.md) - describes early `memblock` allocator.
* [Fix-Mapped Addresses and ioremap](linux-mm-2.md) - describes `fix-mapped` addresses and early `ioremap`.
diff --git a/MM/images/kernel_configuration_menu1.png b/MM/images/kernel_configuration_menu1.png
index 54ef0ac5..bcf59f29 100644
Binary files a/MM/images/kernel_configuration_menu1.png and b/MM/images/kernel_configuration_menu1.png differ
diff --git a/MM/images/kernel_configuration_menu2.png b/MM/images/kernel_configuration_menu2.png
index 6981c94c..aa8d8912 100644
Binary files a/MM/images/kernel_configuration_menu2.png and b/MM/images/kernel_configuration_menu2.png differ
diff --git a/MM/images/memblock.png b/MM/images/memblock.png
index 6e6279bc..6bfe5969 100644
Binary files a/MM/images/memblock.png and b/MM/images/memblock.png differ
diff --git a/MM/linux-mm-1.md b/MM/linux-mm-1.md
index b34ba96e..79db4440 100644
--- a/MM/linux-mm-1.md
+++ b/MM/linux-mm-1.md
@@ -4,7 +4,7 @@ Linux kernel memory management Part 1.
Introduction
--------------------------------------------------------------------------------
-Memory management is one of the most complex (and I think that it is the most complex) part of the operating system kernel. In the [last preparations before the kernel entry point](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-3) part we stopped right before call of the `start_kernel` function. This function initializes all the kernel features (including architecture-dependent features) before the kernel runs the first `init` process. You may remember as we built early page tables, identity page tables and fixmap page tables in the boot time. No complicated memory management is working yet. When the `start_kernel` function is called we will see the transition to more complex data structures and techniques for memory management. For a good understanding of the initialization process in the linux kernel we need to have a clear understanding of these techniques. This chapter will provide an overview of the different parts of the linux kernel memory management framework and its API, starting from the `memblock`.
+Memory management is one of the most complex (and I think that it is the most complex) part of the operating system kernel. In the [last preparations before the kernel entry point](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-3) part we stopped right before the call of the `start_kernel` function. This function initializes all the kernel features (including architecture-dependent features) before the kernel runs the first `init` process. You may remember how we built early page tables, identity page tables and fixmap page tables at boot time. No complicated memory management is working yet. When the `start_kernel` function is called we will see the transition to more complex data structures and techniques for memory management. For a good understanding of the initialization process in the Linux kernel we need to have a clear understanding of these techniques. This chapter will provide an overview of the different parts of the Linux kernel memory management framework and its API, starting from the `memblock`.
Memblock
--------------------------------------------------------------------------------
@@ -155,7 +155,7 @@ On this step the initialization of the `memblock` structure has been finished an
Memblock API
--------------------------------------------------------------------------------
-Ok we have finished with the initialization of the `memblock` structure and now we can look at the Memblock API and its implementation. As I said above, the implementation of `memblock` is taking place fully in [mm/memblock.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/mm/memblock.c). To understand how `memblock` works and how it is implemented, let's look at its usage first. There are a couple of [places](http://lxr.free-electrons.com/ident?i=memblock) in the linux kernel where memblock is used. For example let's take `memblock_x86_fill` function from the [arch/x86/kernel/e820.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/e820.c#L1061). This function goes through the memory map provided by the [e820](http://en.wikipedia.org/wiki/E820) and adds memory regions reserved by the kernel to the `memblock` with the `memblock_add` function. Since we have met the `memblock_add` function first, let's start from it.
+Ok we have finished with the initialization of the `memblock` structure and now we can look at the Memblock API and its implementation. As I said above, the implementation of `memblock` is taking place fully in [mm/memblock.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/mm/memblock.c). To understand how `memblock` works and how it is implemented, let's look at its usage first. There are a couple of [places](http://lxr.free-electrons.com/ident?i=memblock) in the Linux kernel where memblock is used. For example let's take `memblock_x86_fill` function from the [arch/x86/kernel/e820.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/kernel/e820.c#L1061). This function goes through the memory map provided by the [e820](http://en.wikipedia.org/wiki/E820) and adds memory regions reserved by the kernel to the `memblock` with the `memblock_add` function. Since we have met the `memblock_add` function first, let's start from it.
This function takes a physical base address and the size of the memory region as arguments and add them to the `memblock`. The `memblock_add` function does not do anything special in its body, but just calls the:
@@ -163,7 +163,7 @@ This function takes a physical base address and the size of the memory region as
memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0);
```
-function. We pass the memory block type - `memory`, the physical base address and the size of the memory region, the maximum number of nodes which is 1 if `CONFIG_NODES_SHIFT` is not set in the configuration file or `1 << CONFIG_NODES_SHIFT` if it is set, and the flags. The `memblock_add_range` function adds a new memory region to the memory block. It starts by checking the size of the given region and if it is zero it just returns. After this, `memblock_add_range` checks the existence of the memory regions in the `memblock` structure with the given `memblock_type`. If there are no memory regions, we just fill a new `memory_region` with the given values and return (we already saw the implementation of this in the [First touch of the linux kernel memory manager framework](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-3)). If `memblock_type` is not empty, we start to add a new memory region to the `memblock` with the given `memblock_type`.
+function. We pass the memory block type - `memory`, the physical base address and the size of the memory region, the maximum number of nodes which is 1 if `CONFIG_NODES_SHIFT` is not set in the configuration file or `1 << CONFIG_NODES_SHIFT` if it is set, and the flags. The `memblock_add_range` function adds a new memory region to the memory block. It starts by checking the size of the given region and if it is zero it just returns. After this, `memblock_add_range` checks the existence of the memory regions in the `memblock` structure with the given `memblock_type`. If there are no memory regions, we just fill a new `memory_region` with the given values and return (we already saw the implementation of this in the [First touch of the Linux kernel memory manager framework](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-3)). If `memblock_type` is not empty, we start to add a new memory region to the `memblock` with the given `memblock_type`.
First of all we get the end of the memory region with the:
@@ -171,7 +171,7 @@ First of all we get the end of the memory region with the:
phys_addr_t end = base + memblock_cap_size(base, &size);
```
-`memblock_cap_size` adjusts `size` that `base + size` will not overflow. Its implementation is pretty easy:
+`memblock_cap_size` adjusts `size` so that `base + size` will not overflow. Its implementation is pretty easy:
```C
static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size)
@@ -337,10 +337,10 @@ There is also `memblock_reserve` function which does the same as `memblock_add`,
Of course this is not the full API. Memblock provides APIs not only for adding `memory` and `reserved` memory regions, but also:
-* memblock_remove - removes memory region from memblock;
-* memblock_find_in_range - finds free area in given range;
-* memblock_free - releases memory region in memblock;
-* for_each_mem_range - iterates through memblock areas.
+* `memblock_remove` - removes memory region from memblock;
+* `memblock_find_in_range` - finds free area in given range;
+* `memblock_free` - releases memory region in memblock;
+* `for_each_mem_range` - iterates through memblock areas.
and many more....
@@ -349,8 +349,8 @@ Getting info about memory regions
Memblock also provides an API for getting information about allocated memory regions in the `memblock`. It is split in two parts:
-* get_allocated_memblock_memory_regions_info - getting info about memory regions;
-* get_allocated_memblock_reserved_regions_info - getting info about reserved regions.
+* `get_allocated_memblock_memory_regions_info` - getting info about memory regions;
+* `get_allocated_memblock_reserved_regions_info` - getting info about reserved regions.
Implementation of these functions is easy. Let's look at `get_allocated_memblock_reserved_regions_info` for example:
@@ -401,16 +401,16 @@ And you will see something like this:
Memblock also has support in [debugfs](http://en.wikipedia.org/wiki/Debugfs). If you run the kernel on another architecture than `X86` you can access:
-* /sys/kernel/debug/memblock/memory
-* /sys/kernel/debug/memblock/reserved
-* /sys/kernel/debug/memblock/physmem
+* `/sys/kernel/debug/memblock/memory`
+* `/sys/kernel/debug/memblock/reserved`
+* `/sys/kernel/debug/memblock/physmem`
to get a dump of the `memblock` contents.
Conclusion
--------------------------------------------------------------------------------
-This is the end of the first part about linux kernel memory management. If you have questions or suggestions, ping me on twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com) or just create an [issue](https://github.com/0xAX/linux-insides/issues/new).
+This is the end of the first part about Linux kernel memory management. If you have questions or suggestions, ping me on twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com) or just create an [issue](https://github.com/0xAX/linux-insides/issues/new).
**Please note that English is not my first language and I am really sorry for any inconvenience. If you found any mistakes please send me a PR to [linux-insides](https://github.com/0xAX/linux-insides).**
@@ -420,4 +420,4 @@ Links
* [e820](http://en.wikipedia.org/wiki/E820)
* [numa](http://en.wikipedia.org/wiki/Non-uniform_memory_access)
* [debugfs](http://en.wikipedia.org/wiki/Debugfs)
-* [First touch of the linux kernel memory manager framework](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-3)
+* [First touch of the Linux kernel memory manager framework](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-3)
diff --git a/MM/linux-mm-2.md b/MM/linux-mm-2.md
index 2b2fca0f..dd54943e 100644
--- a/MM/linux-mm-2.md
+++ b/MM/linux-mm-2.md
@@ -36,9 +36,9 @@ Base virtual address and size of the `fix-mapped` area are presented by the two
#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
```
-Here `__end_of_permanent_fixed_addresses` is an element of the `fixed_addresses` enum and as I wrote above: Every fix-mapped address is represented by an integer index which is defined in the `fixed_addresses`. `PAGE_SHIFT` determines the size of a page. For example size of the one page we can get with the `1 << PAGE_SHIFT` expression.
+Here `__end_of_permanent_fixed_addresses` is an element of the `fixed_addresses` enum and, as I wrote above, every fix-mapped address is represented by an integer index which is defined in the `fixed_addresses`. `PAGE_SHIFT` determines the size of a page. For example, we can get the size of one page with the `1 << PAGE_SHIFT` expression.
-In our case we need to get the size of the fix-mapped area, but not only of one page, that's why we are using `__end_of_permanent_fixed_addresses` for getting the size of the fix-mapped area. The `__end_of_permanent_fixed_addresses` is the last index of the `fixed_addresses` enum or in other words the `__end_of_permanent_fixed_addresses` contains amount of pages in a fixed-mapped area. So if multiply value of the `__end_of_permanent_fixed_addresses` on a page size value we will get size of fix-mapped area. In my case it's a little more than `536` kilobytes. In your case it might be a different number, because the size depends on amount of the fix-mapped addresses which are depends on your kernel's configuration.
+In our case we need to get the size of the whole fix-mapped area, not of just one page; that's why we are using `__end_of_permanent_fixed_addresses` for getting the size of the fix-mapped area. The `__end_of_permanent_fixed_addresses` is the last index of the `fixed_addresses` enum or, in other words, the `__end_of_permanent_fixed_addresses` contains the number of pages in the fix-mapped area. So if we multiply the value of the `__end_of_permanent_fixed_addresses` by the page size we will get the size of the fix-mapped area. In my case it's a little more than `536` kilobytes. In your case it might be a different number, because the size depends on the number of fix-mapped addresses, which depends on your kernel configuration.
The second `FIXADDR_START` macro just subtracts the fix-mapped area size from the last address of the fix-mapped area to get its base virtual address. `FIXADDR_TOP` is a rounded up address from the base address of the [vsyscall](https://lwn.net/Articles/446528/) space:
@@ -46,8 +46,8 @@ The second `FIXADDR_START` macro just subtracts the fix-mapped area size from th
#define FIXADDR_TOP (round_up(VSYSCALL_ADDR + PAGE_SIZE, 1<<PMD_SHIFT) - PAGE_SIZE)
...
Kernel hacking -> Memory Debugging
```
-
+
menu of the Linux kernel configuration:

@@ -140,7 +140,7 @@ config X86
...
```
-So, there is no anything which is specific for other architectures.
+So, there isn't anything which is specific for other architectures.
Ok, so we know that `kmemcheck` provides mechanism to check usage of `uninitialized memory` in the Linux kernel and how to enable it. How it does these checks? When the Linux kernel tries to allocate some memory i.e. something is called like this:
@@ -148,7 +148,7 @@ Ok, so we know that `kmemcheck` provides mechanism to check usage of `uninitiali
struct my_struct *my_struct = kmalloc(sizeof(struct my_struct), GFP_KERNEL);
```
-or in other words somebody wants to access a [page](https://en.wikipedia.org/wiki/Page_%28computer_memory%29), a [page fault](https://en.wikipedia.org/wiki/Page_fault) exception is generated. This is achieved by the fact that the `kmemcheck` marks memory pages as `non-present` (more about this you can read in the special part which is devoted to [Paging](https://0xax.gitbook.io/linux-insides/summary/theory/linux-theory-1)). If a `page fault` exception is occurred, the exception handler knows about it and in a case when the `kmemcheck` is enabled it transfers control to it. After the `kmemcheck` will finish its checks, the page will be marked as `present` and the interrupted code will be able to continue execution. There is little subtlety in this chain. When the first instruction of interrupted code will be executed, the `kmemcheck` will mark the page as `non-present` again. In this way next access to memory will be caught again.
+or in other words somebody wants to access a [page](https://en.wikipedia.org/wiki/Page_%28computer_memory%29), a [page fault](https://en.wikipedia.org/wiki/Page_fault) exception is generated. This is achieved by the fact that the `kmemcheck` marks memory pages as `non-present` (more about this you can read in the special part which is devoted to [Paging](https://0xax.gitbook.io/linux-insides/summary/theory/linux-theory-1)). If a `page fault` exception occurs, the exception handler knows about it and, in the case when the `kmemcheck` is enabled, transfers control to it. After the `kmemcheck` finishes its checks, the page will be marked as `present` and the interrupted code will be able to continue execution. There is a little subtlety in this chain. When the first instruction of the interrupted code has been executed, the `kmemcheck` will mark the page as `non-present` again. In this way the next access to memory will be caught again.
We just considered the `kmemcheck` mechanism from theoretical side. Now let's consider how it is implemented in the Linux kernel.
@@ -215,9 +215,9 @@ if (!kmemcheck_selftest()) {
printk(KERN_INFO "kmemcheck: Initialized\n");
```
-and return with the `EINVAL` if this check is failed. The `kmemcheck_selftest` function checks sizes of different memory access related [opcodes](https://en.wikipedia.org/wiki/Opcode) like `rep movsb`, `movzwq` and etc. If sizes of opcodes are equal to expected sizes, the `kmemcheck_selftest` will return `true` and `false` in other way.
+and return with the `EINVAL` if this check fails. The `kmemcheck_selftest` function checks sizes of different memory access related [opcodes](https://en.wikipedia.org/wiki/Opcode) like `rep movsb`, `movzwq`, etc. If sizes of opcodes are equal to expected sizes, the `kmemcheck_selftest` will return `true`, and `false` otherwise.
-So when the somebody will call:
+So when somebody calls:
```C
struct my_struct *my_struct = kmalloc(sizeof(struct my_struct), GFP_KERNEL);
@@ -236,7 +236,7 @@ if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
}
```
-So, here we check that the if `kmemcheck` is enabled and the `SLAB_NOTRACK` bit is not set in flags we set `non-present` bit for the just allocated page. The `SLAB_NOTRACK` bit tell us to not track uninitialized memory. Additionally we check if a cache object has constructor (details will be considered in next parts) we mark allocated page as uninitialized or unallocated in other way. The `kmemcheck_alloc_shadow` function is defined in the [mm/kmemcheck.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/mm/kmemcheck.c) source code file and does following things:
+So, here we check that if `kmemcheck` is enabled and the `SLAB_NOTRACK` bit is not set in flags, we set the `non-present` bit for the just allocated page. The `SLAB_NOTRACK` bit tells us to not track uninitialized memory. Additionally, if a cache object has a constructor (details will be considered in next parts) we mark the allocated page as uninitialized, or as unallocated otherwise. The `kmemcheck_alloc_shadow` function is defined in the [mm/kmemcheck.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/mm/kmemcheck.c) source code file and does the following things:
```C
void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
@@ -276,7 +276,7 @@ void kmemcheck_hide_pages(struct page *p, unsigned int n)
}
```
-Here we go through all pages and and tries to get `page table entry` for each page. If this operation was successful, we unset present bit and set hidden bit in each page. In the end we flush [translation lookaside buffer](https://en.wikipedia.org/wiki/Translation_lookaside_buffer), because some pages was changed. From this point allocated pages are tracked by the `kmemcheck`. Now, as `present` bit is unset, the [page fault](https://en.wikipedia.org/wiki/Page_fault) execution will be occurred right after the `kmalloc` will return pointer to allocated space and a code will try to access this memory.
+Here we go through all pages and try to get the `page table entry` for each page. If this operation was successful, we unset the present bit and set the hidden bit in each page. In the end we flush the [translation lookaside buffer](https://en.wikipedia.org/wiki/Translation_lookaside_buffer), because some pages were changed. From this point allocated pages are tracked by the `kmemcheck`. Now, as the `present` bit is unset, the [page fault](https://en.wikipedia.org/wiki/Page_fault) exception will occur right after the `kmalloc` returns a pointer to the allocated space and code tries to access this memory.
As you may remember from the [second part](https://0xax.gitbook.io/linux-insides/summary/initialization/linux-initialization-2) of the Linux kernel initialization chapter, the `page fault` handler is located in the [arch/x86/mm/fault.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/mm/fault.c) source code file and represented by the `do_page_fault` function. We can see following check from the beginning of the `do_page_fault` function:
@@ -296,7 +296,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
}
```
-The `kmemcheck_active` gets `kmemcheck_context` [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) structure and return the result of comparison of the `balance` field of this structure with zero:
+The `kmemcheck_active` gets `kmemcheck_context` [per-cpu](https://0xax.gitbook.io/linux-insides/summary/concepts/linux-cpu-1) structure and returns the result of comparison of the `balance` field of this structure with zero:
```
bool kmemcheck_active(struct pt_regs *regs)
@@ -314,7 +314,7 @@ if (kmemcheck_fault(regs, address, error_code))
return;
```
-First of all the `kmemcheck_fault` function checks that the fault was occurred by the correct reason. At first we check the [flags register](https://en.wikipedia.org/wiki/FLAGS_register) and check that we are in normal kernel mode:
+First of all the `kmemcheck_fault` function checks that the fault occurred for the correct reason. At first we check the [flags register](https://en.wikipedia.org/wiki/FLAGS_register) and check that we are in normal kernel mode:
```C
if (regs->flags & X86_VM_MASK)
@@ -323,7 +323,7 @@ if (regs->cs != __KERNEL_CS)
return false;
```
-If these checks wasn't successful we return from the `kmemcheck_fault` function as it was not `kmemcheck` related page fault. After this we try to lookup a `page table entry` related to the faulted address and if we can't find it we return:
+If these checks weren't successful we return from the `kmemcheck_fault` function as it was not `kmemcheck` related page fault. After this we try to lookup a `page table entry` related to the faulted address and if we can't find it we return:
```C
pte = kmemcheck_pte_lookup(address);
@@ -331,7 +331,7 @@ if (!pte)
return false;
```
-Last two steps of the `kmemcheck_fault` function is to call the `kmemcheck_access` function which check access to the given page and show addresses again by setting present bit in the given page. The `kmemcheck_access` function does all main job. It check current instruction which caused a page fault. If it will find an error, the context of this error will be saved by `kmemcheck` to the ring queue:
+The last two steps of the `kmemcheck_fault` function are to call the `kmemcheck_access` function, which checks access to the given page, and to show addresses again by setting the present bit in the given page. The `kmemcheck_access` function does all the main work. It checks the current instruction which caused a page fault. If it finds an error, the context of this error will be saved by `kmemcheck` to the ring queue:
```C
static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE];
@@ -343,7 +343,7 @@ The `kmemcheck` mechanism declares special [tasklet](https://0xax.gitbook.io/lin
static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0);
```
-which runs the `do_wakeup` function from the [arch/x86/mm/kmemcheck/error.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/mm/kmemcheck/error.c) source code file when it will be scheduled to run.
+which runs the `do_wakeup` function from the [arch/x86/mm/kmemcheck/error.c](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/mm/kmemcheck/error.c) source code file when it is scheduled to run.
The `do_wakeup` function will call the `kmemcheck_error_recall` function which will print errors collected by `kmemcheck`. As we already saw the:
@@ -410,7 +410,7 @@ That's all.
Conclusion
--------------------------------------------------------------------------------
-This is the end of the third part about linux kernel [memory management](https://en.wikipedia.org/wiki/Memory_management). If you have questions or suggestions, ping me on twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com) or just create an [issue](https://github.com/0xAX/linux-insides/issues/new). In the next part we will see yet another memory debugging related tool - `kmemleak`.
+This is the end of the third part about Linux kernel [memory management](https://en.wikipedia.org/wiki/Memory_management). If you have questions or suggestions, ping me on twitter [0xAX](https://twitter.com/0xAX), drop me an [email](mailto:anotherworldofworld@gmail.com) or just create an [issue](https://github.com/0xAX/linux-insides/issues/new). In the next part we will see yet another memory debugging related tool - `kmemleak`.
**Please note that English is not my first language and I am really sorry for any inconvenience. If you found any mistakes please send me a PR to [linux-insides](https://github.com/0xAX/linux-insides).**
diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000..6777b2f1
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,79 @@
+### HELP
+
+# Self-documenting help. Prints the section headers (lines starting with
+# three hashes) and every target that carries a trailing double-hash
+# description. grep -E replaces the obsolescent egrep wrapper, which newer
+# GNU grep releases warn about on every invocation.
+.PHONY: help
+help: ## Print help
+	@grep -E "(^### |^\S+:.*##\s)" Makefile | sed -e 's/^###\s*//' -e 's/^\(\S*\)\:.*##\s*\(.*\)/ \1 - \2/'
+
+### DOCKER
+
+# Build the image, stop a container that may still be running under the same
+# name (errors are ignored), then start a new detached container publishing
+# port 4000.
+.PHONY: run
+run: image ## docker run ...
+	docker stop linux-insides-book > /dev/null 2>&1 || true
+	docker run --detach -p 4000:4000 --name linux-insides-book --hostname linux-insides-book linux-insides-book
+
+# Start the existing container again.
+.PHONY: start
+start: ## start the docker container ...
+	docker start linux-insides-book
+
+# Try a squashed build first with its error output discarded; if that fails,
+# fall back to an ordinary build.
+.PHONY: image
+image: ## docker image build ...
+	docker image build --rm --squash --label linux-insides --tag linux-insides-book:latest -f Dockerfile . 2>/dev/null || \
+	docker image build --rm --label linux-insides --tag linux-insides-book:latest -f Dockerfile .
+
+# Open an interactive bash session in the container.
+.PHONY: sh
+sh: ## run interactive shell inside an already running docker container ...
+	docker exec -it linux-insides-book bash
+
+# Stop and delete the container; both steps are best-effort and silent.
+.PHONY: rm
+rm: ## remove the docker container ...
+	docker stop linux-insides-book > /dev/null 2>&1 || true
+	docker rm linux-insides-book > /dev/null 2>&1 || true
+
+# Print the container's log output.
+.PHONY: logs
+logs: ## gather logs from the docker container ...
+	docker logs linux-insides-book
+
+# Generate the e-books inside the running container.
+#   1. Every SVG outside ./.github and ./_book is rendered to a PNG next to
+#      it, and the links in the markdown files of the directory two levels
+#      above the image are rewritten from the .svg path to the .png path.
+#   2. gitbook builds the epub, mobi and pdf editions; book.json is then
+#      replaced by book-A5.json and the pdf is built a second time as
+#      book-A5.pdf (the first pdf is presumably the A4 layout).
+# The whole script is handed to bash as ONE single-quoted argument, so it
+# must not contain single quotes itself: the glob pattern and the nested
+# dirname substitution are double-quoted to keep them from being expanded
+# or word-split by the shell.
+.PHONY: export
+export: ## run e-book generation inside an already running docker container ...
+	docker exec linux-insides-book /bin/bash -c ' \
+		find . -type f -name "*.svg" -a ! \( -path "./.github/*" -o -path "./_book/*" \) -print0 | while IFS= read -r -d "" svg_file; do \
+			output_file="$${svg_file%.svg}.png"; \
+			chapter_dir="$$(dirname "$$(dirname "$$svg_file")")"; \
+			svg_relative_path="$${svg_file#$$chapter_dir/}"; \
+			output_relative_path="$${output_file#$$chapter_dir/}"; \
+			inkscape --export-png="$$output_file" \
+				--export-area-page \
+				--export-dpi=150 \
+				"$$svg_file"; \
+			find "$$chapter_dir" -maxdepth 1 -type f -name "*.md" -print0 | xargs -0 sed -i "s|\\([/ \\t\\(]\\)$${svg_relative_path}|\\1$${output_relative_path}|g"; \
+		done; \
+		gitbook epub; \
+		gitbook mobi; \
+		gitbook pdf; \
+		mv book.pdf book-A4.pdf; \
+		mv book-A5.json book.json; \
+		gitbook pdf; \
+		mv book.pdf book-A5.pdf; \
+		mv book-A4.pdf book.pdf'
+
+# Copy the generated books from /srv/gitbook in the container to the current
+# directory under their publication names.
+.PHONY: cp
+cp: ## copy all exported e-book formats to current working directory ...
+	docker cp linux-insides-book:/srv/gitbook/book.epub "Linux Inside - 0xAX.epub"
+	docker cp linux-insides-book:/srv/gitbook/book.mobi "Linux Inside - 0xAX.mobi"
+	docker cp linux-insides-book:/srv/gitbook/book.pdf "Linux Inside - 0xAX.pdf"
+	docker cp linux-insides-book:/srv/gitbook/book-A5.pdf "Linux Inside - 0xAX (A5).pdf"
+
+# Delete the copied books. -f makes the target succeed even when some or all
+# of the files are absent (for example when nothing has been exported yet).
+.PHONY: clean
+clean: ## remove all exported e-book files ...
+	rm -f "Linux Inside - 0xAX.epub" \
+		"Linux Inside - 0xAX.mobi" \
+		"Linux Inside - 0xAX.pdf" \
+		"Linux Inside - 0xAX (A5).pdf"
+
+### LAUNCH BROWSER
+
+# Poll 127.0.0.1:4000 once a second for at most 60 seconds, then open the
+# book with `open` on Darwin or `xdg-open` on Linux. Every step is
+# best-effort, so the target never fails.
+.PHONY: browse
+browse: ## Launch browser
+	@timeout 60 sh -c 'until nc -z 127.0.0.1 4000; do sleep 1; done' || true
+	@(uname | grep Darwin > /dev/null) && open http://127.0.0.1:4000 || true
+	@(uname | grep Linux > /dev/null) && xdg-open http://127.0.0.1:4000 || true
diff --git a/Misc/images/dgap_menu.png b/Misc/images/dgap_menu.png
index 27e23c5c..9eefa072 100644
Binary files a/Misc/images/dgap_menu.png and b/Misc/images/dgap_menu.png differ
diff --git a/Misc/images/git_diff.png b/Misc/images/git_diff.png
index 30cc493c..738a362e 100644
Binary files a/Misc/images/git_diff.png and b/Misc/images/git_diff.png differ
diff --git a/Misc/images/github.png b/Misc/images/github.png
index d2da8950..f0591c0d 100644
Binary files a/Misc/images/github.png and b/Misc/images/github.png differ
diff --git a/Misc/images/google_linux.png b/Misc/images/google_linux.png
index c9eeeab8..9123bde4 100644
Binary files a/Misc/images/google_linux.png and b/Misc/images/google_linux.png differ
diff --git a/Misc/images/menuconfig.png b/Misc/images/menuconfig.png
index 97542825..c86f25c8 100644
Binary files a/Misc/images/menuconfig.png and b/Misc/images/menuconfig.png differ
diff --git a/Misc/images/nconfig.png b/Misc/images/nconfig.png
index 5a142396..57908953 100644
Binary files a/Misc/images/nconfig.png and b/Misc/images/nconfig.png differ
diff --git a/Misc/images/qemu.png b/Misc/images/qemu.png
index 2599f78e..54218b6b 100644
Binary files a/Misc/images/qemu.png and b/Misc/images/qemu.png differ
diff --git a/Misc/linux-misc-1.md b/Misc/linux-misc-1.md
index ad417d0d..1cc1b57b 100644
--- a/Misc/linux-misc-1.md
+++ b/Misc/linux-misc-1.md
@@ -165,7 +165,7 @@ As result of compilation we can see the compressed kernel - `arch/x86/boot/bzIma
Installing Linux kernel
--------------------------------------------------------------------------------
-As I already wrote we will consider two ways how to launch new kernel: In the first case we can install and run the new version of the Linux kernel on the real hardware and the second is launch the Linux kernel on a virtual machine. In the previous paragraph we saw how to build the Linux kernel from source code and as a result we have got compressed image:
+As I already wrote, we will consider two ways to launch the new kernel: in the first case we can install and run the new version of the Linux kernel on real hardware, and in the second we launch the Linux kernel on a virtual machine. In the previous paragraph we saw how to build the Linux kernel from source code and as a result we have got a compressed image:
```
...
@@ -224,7 +224,7 @@ $ make -j4
`busybox` is an executable file - `/bin/busybox` that contains a set of standard tools like [coreutils](https://en.wikipedia.org/wiki/GNU_Core_Utilities). In the `busybox` menu we need to enable: `Build BusyBox as a static binary (no shared libs)` option:
-
+
We can find this menu in the:
@@ -270,7 +270,7 @@ $ find . -print0 | cpio --null -ov --format=newc | gzip -9 > ~/dev/initrd_x86_64
We can now run our kernel in the virtual machine. As I already wrote I prefer [qemu](https://en.wikipedia.org/wiki/QEMU) for this. We can run our kernel with the following command:
```
-$ qemu-system-x86_64 -snapshot -m 8GB -serial stdio -kernel ~/dev/linux/arch/x86_64/boot/bzImage -initrd ~/dev/initrd_x86_64.gz -append "root=/dev/sda1 ignore_loglevel"
+$ qemu-system-x86_64 -snapshot -m 8G -serial stdio -kernel ~/dev/linux/arch/x86_64/boot/bzImage -initrd ~/dev/initrd_x86_64.gz -append "root=/dev/sda1 ignore_loglevel"
```

@@ -282,7 +282,7 @@ Consider using [ivandaviov/minimal](https://github.com/ivandavidov/minimal) or [
Getting started with the Linux Kernel Development
---------------------------------------------------------------------------------
-The main point of this paragraph is to answer two questions: What to do and what not to do before sending your first patch to the Linux kernel. Please, do not confuse this `to do` with `todo`. I have no answer what you can fix in the Linux kernel. I just want to tell you my workflow during experimenting with the Linux kernel source code.
+The main point of this paragraph is to answer two questions: what to do and what not to do before sending your first patch to the Linux kernel. Please, do not confuse this `to do` with `todo`. I have no answer what you can fix in the Linux kernel. I just want to tell you my workflow during experimenting with the Linux kernel source code.
First of all I pull the latest updates from Linus's repo with the following commands:
@@ -291,7 +291,7 @@ $ git checkout master
$ git pull upstream master
```
-As soon as your local copy of the linux kernel source code is in sync with the [mainline](https://github.com/torvalds/linux) repository, we can start to apply changes to it. I already wrote, I have no advice for where you should start and what `TODO` to choose within the Linux kernel. But the best place for newbies is the `staging` tree. In other words the set of drivers from the [drivers/staging](https://github.com/torvalds/linux/tree/master/drivers/staging) directory. The maintainer of this tree is [Greg Kroah-Hartman](https://en.wikipedia.org/wiki/Greg_Kroah-Hartman) and the `staging` drivers are a good target for trivial patch fixes. Let's look at this simple example, that describes how to generate a patch, check it and send it to the [Linux kernel mail listing](https://lkml.org/).
+As soon as your local copy of the Linux kernel source code is in sync with the [mainline](https://github.com/torvalds/linux) repository, we can start to apply changes to it. As I already wrote, I have no advice for where you should start and what `TODO` to choose within the Linux kernel. But the best place for newbies is the `staging` tree. In other words the set of drivers from the [drivers/staging](https://github.com/torvalds/linux/tree/master/drivers/staging) directory. The maintainer of this tree is [Greg Kroah-Hartman](https://en.wikipedia.org/wiki/Greg_Kroah-Hartman) and the `staging` drivers are a good target for trivial patch fixes. Let's look at this simple example, which describes how to generate a patch, check it and send it to the [Linux kernel mailing list](https://lkml.org/).
If we look in the driver for the [Digi International EPCA PCI](https://github.com/torvalds/linux/tree/master/drivers/staging/dgap) based devices, we will see the `dgap_sindex` function on line 295:
diff --git a/Misc/linux-misc-2.md b/Misc/linux-misc-2.md
index c03fb07c..a1669423 100644
--- a/Misc/linux-misc-2.md
+++ b/Misc/linux-misc-2.md
@@ -110,7 +110,7 @@ The `C` option tells the `makefile` that we need to check all `c` source code wi
ifeq ($(KBUILD_SRC),)
srctree := .
endif
-
+
objtree := .
src := $(srctree)
obj := $(objtree)
@@ -250,7 +250,7 @@ all: vmlinux
Don't worry that we have missed many lines in Makefile that are between `export RCS_FIND_IGNORE.....` and `all: vmlinux.....`. This part of the makefile is responsible for the `make *.config` targets and as I wrote in the beginning of this part we will see only building of the kernel in a general way.
-The `all:` target is the default when no target is given on the command line. You can see here that we include architecture specific makefile there (in our case it will be [arch/x86/Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/Makefile)). From this moment we will continue from this makefile. As we can see `all` target depends on the `vmlinux` target that defined a little lower in the top makefile:
+The `all:` target is the default when no target is given on the command line. You can see here that we include architecture specific makefile there (in our case it will be [arch/x86/Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/Makefile)). From this moment we will continue from this makefile. As we can see `all` target depends on the `vmlinux` target that is defined a little lower in the top makefile:
```Makefile
vmlinux: scripts/link-vmlinux.sh $(vmlinux-deps) FORCE
@@ -302,7 +302,7 @@ prepare1: prepare2 $(version_h) include/generated/utsrelease.h \
prepare2: prepare3 outputmakefile asm-generic
```
-The first `prepare0` expands to the `archprepare` that expands to the `archheaders` and `archscripts` that defined in the `x86_64` specific [Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/Makefile). Let's look on it. The `x86_64` specific makefile starts from the definition of the variables that are related to the architecture-specific configs ([defconfig](https://github.com/torvalds/linux/tree/master/arch/x86/configs), etc...). After this it defines flags for the compiling of the [16-bit](https://en.wikipedia.org/wiki/Real_mode) code, calculating of the `BITS` variable that can be `32` for `i386` or `64` for the `x86_64` flags for the assembly source code, flags for the linker and many many more (all definitions you can find in the [arch/x86/Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/Makefile)). The first target is `archheaders` in the makefile generates syscall table:
+The first `prepare0` expands to the `archprepare` that expands to the `archheaders` and `archscripts` that are defined in the `x86_64` specific [Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/Makefile). Let's look at it. The `x86_64` specific makefile starts with the definition of the variables that are related to the architecture-specific configs ([defconfig](https://github.com/torvalds/linux/tree/master/arch/x86/configs), etc...). After this it defines flags for the compiling of the [16-bit](https://en.wikipedia.org/wiki/Real_mode) code, calculating of the `BITS` variable that can be `32` for `i386` or `64` for the `x86_64`, flags for the assembly source code, flags for the linker and many many more (all definitions you can find in the [arch/x86/Makefile](https://github.com/torvalds/linux/blob/16f73eb02d7e1765ccab3d2018e0bd98eb93d973/arch/x86/Makefile)). The first target in the makefile is `archheaders`, and it generates the syscall table:
```Makefile
archheaders:
@@ -425,7 +425,7 @@ $(vmlinux-dirs): prepare scripts
$(Q)$(MAKE) $(build)=$@
```
-The `$@` represents `vmlinux-dirs` here that means that it will go recursively over all directories from the `vmlinux-dirs` and its internal directories (depens on configuration) and will execute `make` in there. We can see it in the output:
+The `$@` represents `vmlinux-dirs` here that means that it will go recursively over all directories from the `vmlinux-dirs` and its internal directories (depends on configuration) and will execute `make` in there. We can see it in the output:
```
CC init/main.o
diff --git a/Misc/linux-misc-3.md b/Misc/linux-misc-3.md
index 767005fd..aa897cd3 100644
--- a/Misc/linux-misc-3.md
+++ b/Misc/linux-misc-3.md
@@ -39,7 +39,7 @@ The `lib.c` file contains:
```C
int factorial(int base) {
int res,i = 1;
-
+
if (base == 0) {
return 1;
}
@@ -107,8 +107,8 @@ Disassembly of section .text:
20: b8 00 00 00 00 mov $0x0,%eax
25: e8 00 00 00 00 callq 2a
2a: b8 00 00 00 00 mov $0x0,%eax
- 2f: c9 leaveq
- 30: c3 retq
+ 2f: c9 leaveq
+ 30: c3 retq
```
Here we are interested only in the two `callq` operations. The two `callq` operations contain `linker stubs`, or the function name and offset from it to the next instruction. These stubs will be updated to the real addresses of the functions. We can see these functions' names within the following `objdump` output:
@@ -169,7 +169,7 @@ factorial: file format elf64-x86-64
...
```
-As we can see in the previous output, the address of the `main` function is `0x0000000000400506`. Why it does not start from `0x0`? You may already know that standard C programs are linked with the `glibc` C standard library (assuming the `-nostdlib` was not passed to the `gcc`). The compiled code for a program includes constructor functions to initialize data in the program when the program is started. These functions need to be called before the program is started, or in another words before the `main` function is called. To make the initialization and termination functions work, the compiler must output something in the assembler code to cause those functions to be called at the appropriate time. Execution of this program will start from the code placed in the special `.init` section. We can see this in the beginning of the objdump output:
+As we can see in the previous output, the address of the `main` function is `0x0000000000400506`. Why doesn't it start from `0x0`? You may already know that standard C programs are linked with the `glibc` C standard library (assuming the `-nostdlib` was not passed to the `gcc`). The compiled code for a program includes constructor functions to initialize data in the program when the program is started. These functions need to be called before the program is started, or in other words before the `main` function is called. To make the initialization and termination functions work, the compiler must output something in the assembler code to cause those functions to be called at the appropriate time. Execution of this program will start from the code placed in the special `.init` section. We can see this in the beginning of the objdump output:
```
objdump -S factorial | less
@@ -215,7 +215,7 @@ $ gcc main.c lib.o -o factorial
and after it we will get executable file - `factorial` as a result:
```
-./factorial
+./factorial
factorial of 5 is: 120
```
@@ -312,13 +312,13 @@ $ objdump -S /usr/lib/gcc/x86_64-linux-gnu/4.9/../../../x86_64-linux-gnu/crtn.o
0000000000000000 <.init>:
0: 48 83 c4 08 add $0x8,%rsp
- 4: c3 retq
+ 4: c3 retq
Disassembly of section .fini:
0000000000000000 <.fini>:
0: 48 83 c4 08 add $0x8,%rsp
- 4: c3 retq
+ 4: c3 retq
```
And the `crti.o` object file contains the `_init` and `_fini` symbols. Let's try to link again with these two object files:
@@ -344,14 +344,14 @@ $ ld \
Finally we get an executable file, but if we try to run it, we will get strange results:
```
-$ ./factorial
+$ ./factorial
bash: ./factorial: No such file or directory
```
What's the problem here? Let's look on the executable file with the [readelf](https://sourceware.org/binutils/docs/binutils/readelf.html) util:
```
-$ readelf -l factorial
+$ readelf -l factorial
Elf file type is EXEC (Executable file)
Entry point 0x4003c0
@@ -378,13 +378,13 @@ Program Headers:
Section to Segment mapping:
Segment Sections...
- 00
- 01 .interp
- 02 .interp .note.ABI-tag .hash .dynsym .dynstr .gnu.version .gnu.version_r .rela.dyn .rela.plt .init .plt .text .fini .rodata .eh_frame
- 03 .dynamic .got .got.plt .data
- 04 .dynamic
- 05 .note.ABI-tag
- 06
+ 00
+ 01 .interp
+ 02 .interp .note.ABI-tag .hash .dynsym .dynstr .gnu.version .gnu.version_r .rela.dyn .rela.plt .init .plt .text .fini .rodata .eh_frame
+ 03 .dynamic .got .got.plt .data
+ 04 .dynamic
+ 05 .note.ABI-tag
+ 06
```
Note on the strange line:
@@ -429,7 +429,7 @@ and after this we link object files of our program with the needed system object
Useful command line options of the GNU linker
----------------------------------------------
-As I already wrote and as you can see in the manual of the `GNU linker`, it has big set of the command line options. We've seen a couple of options in this post: `-o