//
// virtio_disk.c
//

//
// driver for qemu's virtio disk device.
// uses qemu's mmio interface to virtio.
//
// qemu ... -drive file=fs.img,if=none,format=raw,id=x0 -device virtio-blk-device,drive=x0,bus=virtio-mmio-bus.0
//

#include "types.h"
#include "riscv.h"
#include "defs.h"
#include "param.h"
#include "memlayout.h"
#include "spinlock.h"
#include "sleeplock.h"
#include "fs.h"
#include "buf.h"
#include "virtio.h"

// the address of virtio mmio register r.

// R(r): VIRTIO0 + r, cast to a pointer to volatile uint32, so *R(r)
// reads or writes one 32-bit word at that offset.
#define R(r) ((volatile uint32 *)(VIRTIO0 + (r)))

static struct disk {

TIP

a set (not a ring) of DMA descriptors, with which the driver tells the device where to read and write individual disk operations. there are NUM descriptors. most commands consist of a "chain" (a linked list) of a couple of these descriptors.

  struct virtq_desc *desc;

TIP

a ring in which the driver writes descriptor numbers that the driver would like the device to process. it only includes the head descriptor of each chain. the ring has NUM elements.

  struct virtq_avail *avail;

TIP

a ring in which the device writes descriptor numbers that the device has finished processing (just the head of each chain). there are NUM used ring entries.

  struct virtq_used *used;

TIP

our own book-keeping.

  char free[NUM];
  uint16 used_idx;

TIP

track info about in-flight operations, for use when completion interrupt arrives. indexed by first descriptor index of chain.

  struct {
    struct buf *b;
    char status;
  } info[NUM];

TIP

disk command headers. one-for-one with descriptors, for convenience.

  struct virtio_blk_req ops[NUM];
  
  struct spinlock vdisk_lock;
  
} disk;

void
virtio_disk_init(void)
{
  uint32 status = 0;

  initlock(&disk.vdisk_lock, "virtio_disk");

  if(*R(VIRTIO_MMIO_MAGIC_VALUE) != 0x74726976 ||
     *R(VIRTIO_MMIO_VERSION) != 2 ||
     *R(VIRTIO_MMIO_DEVICE_ID) != 2 ||
     *R(VIRTIO_MMIO_VENDOR_ID) != 0x554d4551){
    panic("could not find virtio disk");
  }
  

TIP

reset device

  *R(VIRTIO_MMIO_STATUS) = status;

TIP

set ACKNOWLEDGE status bit

  status |= VIRTIO_CONFIG_S_ACKNOWLEDGE;
  *R(VIRTIO_MMIO_STATUS) = status;

TIP

set DRIVER status bit

  status |= VIRTIO_CONFIG_S_DRIVER;
  *R(VIRTIO_MMIO_STATUS) = status;

TIP

negotiate features

  uint64 features = *R(VIRTIO_MMIO_DEVICE_FEATURES);
  features &= ~(1 << VIRTIO_BLK_F_RO);
  features &= ~(1 << VIRTIO_BLK_F_SCSI);
  features &= ~(1 << VIRTIO_BLK_F_CONFIG_WCE);
  features &= ~(1 << VIRTIO_BLK_F_MQ);
  features &= ~(1 << VIRTIO_F_ANY_LAYOUT);
  features &= ~(1 << VIRTIO_RING_F_EVENT_IDX);
  features &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC);
  *R(VIRTIO_MMIO_DRIVER_FEATURES) = features;

TIP

tell device that feature negotiation is complete.

  status |= VIRTIO_CONFIG_S_FEATURES_OK;
  *R(VIRTIO_MMIO_STATUS) = status;

TIP

re-read status to ensure FEATURES_OK is set.

  status = *R(VIRTIO_MMIO_STATUS);
  if(!(status & VIRTIO_CONFIG_S_FEATURES_OK))
    panic("virtio disk FEATURES_OK unset");

TIP

initialize queue 0.

  *R(VIRTIO_MMIO_QUEUE_SEL) = 0;

TIP

ensure queue 0 is not in use.

  if(*R(VIRTIO_MMIO_QUEUE_READY))
    panic("virtio disk should not be ready");

TIP

check maximum queue size.

  uint32 max = *R(VIRTIO_MMIO_QUEUE_NUM_MAX);
  if(max == 0)
    panic("virtio disk has no queue 0");
  if(max < NUM)
    panic("virtio disk max queue too short");

TIP

allocate and zero queue memory.

  disk.desc = kalloc();
  disk.avail = kalloc();
  disk.used = kalloc();
  if(!disk.desc || !disk.avail || !disk.used)
    panic("virtio disk kalloc");
  memset(disk.desc, 0, PGSIZE);
  memset(disk.avail, 0, PGSIZE);
  memset(disk.used, 0, PGSIZE);

TIP

set queue size.

  *R(VIRTIO_MMIO_QUEUE_NUM) = NUM;

TIP

write physical addresses.

  *R(VIRTIO_MMIO_QUEUE_DESC_LOW) = (uint64)disk.desc;
  *R(VIRTIO_MMIO_QUEUE_DESC_HIGH) = (uint64)disk.desc >> 32;
  *R(VIRTIO_MMIO_DRIVER_DESC_LOW) = (uint64)disk.avail;
  *R(VIRTIO_MMIO_DRIVER_DESC_HIGH) = (uint64)disk.avail >> 32;
  *R(VIRTIO_MMIO_DEVICE_DESC_LOW) = (uint64)disk.used;
  *R(VIRTIO_MMIO_DEVICE_DESC_HIGH) = (uint64)disk.used >> 32;

TIP

queue is ready.

  *R(VIRTIO_MMIO_QUEUE_READY) = 0x1;

TIP

all NUM descriptors start out unused.

  for(int i = 0; i < NUM; i++)
    disk.free[i] = 1;

TIP

tell device we're completely ready.

  status |= VIRTIO_CONFIG_S_DRIVER_OK;
  *R(VIRTIO_MMIO_STATUS) = status;

TIP

plic.c and trap.c arrange for interrupts from VIRTIO0_IRQ.

}

// find a free descriptor, mark it non-free, return its index.

// claims the lowest-numbered descriptor whose free[] entry is set.
// returns the claimed index, or -1 when all NUM descriptors are taken.
static int
alloc_desc()
{
  int slot = 0;

  while(slot < NUM){
    if(disk.free[slot] != 0){
      // claim it before handing the index back.
      disk.free[slot] = 0;
      return slot;
    }
    slot++;
  }

  // every descriptor is currently in use.
  return -1;
}

// mark a descriptor as free.

// returns descriptor i to the free pool: clears its fields, sets
// free[i], and wakes anything sleeping on &disk.free[0] (where
// virtio_disk_rw() waits for descriptors).
// panics if i is outside 0..NUM-1 or if the descriptor is already free.
static void
free_desc(int i)
{
  // i is signed, so reject negative values as well as i >= NUM;
  // either would index outside disk.free[] and disk.desc[].
  if(i < 0 || i >= NUM)
    panic("free_desc 1");
  if(disk.free[i])
    panic("free_desc 2");
  disk.desc[i].addr = 0;
  disk.desc[i].len = 0;
  disk.desc[i].flags = 0;
  disk.desc[i].next = 0;
  disk.free[i] = 1;
  wakeup(&disk.free[0]);
}

// free a chain of descriptors.

// releases every descriptor in the chain starting at index i,
// following the next links for as long as VRING_DESC_F_NEXT is set.
static void
free_chain(int i)
{
  int more;

  do {
    // free_desc() zeroes the descriptor, so capture the link first.
    int flags = disk.desc[i].flags;
    int next = disk.desc[i].next;

    free_desc(i);

    more = flags & VRING_DESC_F_NEXT;
    i = next;
  } while(more);
}

// allocate three descriptors (they need not be contiguous).
// disk transfers always use three descriptors.

// fills idx[0..2] with three descriptor indices from alloc_desc().
// returns 0 on success. if any allocation fails, the descriptors
// obtained so far are freed again and -1 is returned.
static int
alloc3_desc(int *idx)
{
  for(int n = 0; n < 3; n++){
    int d = alloc_desc();

    idx[n] = d;
    if(d >= 0)
      continue;

    // out of descriptors: give back the ones already claimed.
    for(int k = 0; k < n; k++)
      free_desc(idx[k]);
    return -1;
  }
  return 0;
}

void
virtio_disk_rw(struct buf *b, int write)
{
  uint64 sector = b->blockno * (BSIZE / 512);

  acquire(&disk.vdisk_lock);

TIP

the spec's Section 5.2 says that legacy block operations use three descriptors: one for type/reserved/sector, one for the data, one for a 1-byte status result.

TIP

allocate the three descriptors.

  int idx[3];
  while(1){
    if(alloc3_desc(idx) == 0) {
      break;
    }
    sleep(&disk.free[0], &disk.vdisk_lock);
  }

TIP

format the three descriptors. qemu's virtio-blk.c reads them.

  struct virtio_blk_req *buf0 = &disk.ops[idx[0]];

  if(write)
    buf0->type = VIRTIO_BLK_T_OUT;
  else
    buf0->type = VIRTIO_BLK_T_IN;
  buf0->reserved = 0;
  buf0->sector = sector;

  disk.desc[idx[0]].addr = (uint64) buf0;
  disk.desc[idx[0]].len = sizeof(struct virtio_blk_req);
  disk.desc[idx[0]].flags = VRING_DESC_F_NEXT;
  disk.desc[idx[0]].next = idx[1];

  disk.desc[idx[1]].addr = (uint64) b->data;
  disk.desc[idx[1]].len = BSIZE;
  if(write)
    disk.desc[idx[1]].flags = 0;
  else
    disk.desc[idx[1]].flags = VRING_DESC_F_WRITE;
  disk.desc[idx[1]].flags |= VRING_DESC_F_NEXT;
  disk.desc[idx[1]].next = idx[2];

  disk.info[idx[0]].status = 0xff;
  disk.desc[idx[2]].addr = (uint64) &disk.info[idx[0]].status;
  disk.desc[idx[2]].len = 1;
  disk.desc[idx[2]].flags = VRING_DESC_F_WRITE;
  disk.desc[idx[2]].next = 0;

TIP

record struct buf for virtio_disk_intr().

  b->disk = 1;
  disk.info[idx[0]].b = b;

TIP

tell the device the first index in our chain of descriptors.

  disk.avail->ring[disk.avail->idx % NUM] = idx[0];

  __sync_synchronize();

TIP

tell the device another avail ring entry is available.

  disk.avail->idx += 1;

  __sync_synchronize();

  *R(VIRTIO_MMIO_QUEUE_NOTIFY) = 0;

TIP

Wait for virtio_disk_intr() to say request has finished.

  while(b->disk == 1) {
    sleep(b, &disk.vdisk_lock);
  }

  disk.info[idx[0]].b = 0;
  free_chain(idx[0]);

  release(&disk.vdisk_lock);
}

void
virtio_disk_intr()
{
  acquire(&disk.vdisk_lock);

TIP

the device won't raise another interrupt until we tell it we've seen this interrupt, which the following line does. this may race with the device writing new entries to the "used" ring, in which case we may process the new completion entries in this interrupt, and have nothing to do in the next interrupt, which is harmless.

  *R(VIRTIO_MMIO_INTERRUPT_ACK) = *R(VIRTIO_MMIO_INTERRUPT_STATUS) & 0x3;

  __sync_synchronize();

TIP

the device increments disk.used->idx when it adds an entry to the used ring.

  while(disk.used_idx != disk.used->idx){
    __sync_synchronize();
    int id = disk.used->ring[disk.used_idx % NUM].id;

    if(disk.info[id].status != 0)
      panic("virtio_disk_intr status");

    struct buf *b = disk.info[id].b;
    b->disk = 0;
    wakeup(b);

    disk.used_idx += 1;
  }

  release(&disk.vdisk_lock);
}