The call graph for this function is shown in Figure 11.2. This is the high-level API function for searching the swap areas for a free swap slot and returning the resulting swp_entry_t.
 99 swp_entry_t get_swap_page(void)
100 {
101     struct swap_info_struct * p;
102     unsigned long offset;
103     swp_entry_t entry;
104     int type, wrapped = 0;
105 
106     entry.val = 0;  /* Out of memory */
107     swap_list_lock();
108     type = swap_list.next;
109     if (type < 0)
110         goto out;
111     if (nr_swap_pages <= 0)
112         goto out;
113 
114     while (1) {
115         p = &swap_info[type];
116         if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
117             swap_device_lock(p);
118             offset = scan_swap_map(p);
119             swap_device_unlock(p);
120             if (offset) {
121                 entry = SWP_ENTRY(type,offset);
122                 type = swap_info[type].next;
123                 if (type < 0 ||
124                     p->prio != swap_info[type].prio) {
125                       swap_list.next = swap_list.head;
126                 } else {
127                     swap_list.next = type;
128                 }
129                 goto out;
130             }
131         }
132         type = p->next;
133         if (!wrapped) {
134             if (type < 0 || p->prio != swap_info[type].prio) {
135                 type = swap_list.head;
136                 wrapped = 1;
137             }
138         } else
139             if (type < 0)
140                 goto out;     /* out of swap space */
141     }
142 out:
143     swap_list_unlock();
144     return entry;
145 }
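The returned swp_entry_t packs two pieces of information into a single unsigned long: the index of the swap area (the type) and the slot offset within it. The exact bit layout is architecture-specific and is hidden behind the SWP_ENTRY(), SWP_TYPE() and SWP_OFFSET() macros in <asm/pgtable.h>. The following is a minimal sketch of the idea, using hypothetical names and an assumed 7-bit type field; the real macros also reserve bits so that a swap entry can never be mistaken for a present PTE.

    /* Sketch only: not the real layout of any architecture */
    #define SKETCH_TYPE_BITS 7

    typedef struct { unsigned long val; } sketch_swp_entry_t;

    static inline sketch_swp_entry_t SKETCH_SWP_ENTRY(unsigned long type,
                                                      unsigned long offset)
    {
        sketch_swp_entry_t entry;
        entry.val = (offset << SKETCH_TYPE_BITS) | type;
        return entry;
    }

    static inline unsigned long SKETCH_SWP_TYPE(sketch_swp_entry_t entry)
    {
        return entry.val & ((1UL << SKETCH_TYPE_BITS) - 1);
    }

    static inline unsigned long SKETCH_SWP_OFFSET(sketch_swp_entry_t entry)
    {
        return entry.val >> SKETCH_TYPE_BITS;
    }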
This function tries to allocate SWAPFILE_CLUSTER pages sequentially in swap. When it has allocated that many, it searches for another block of free slots of size SWAPFILE_CLUSTER. If it fails to find one, it resorts to allocating the first free slot. This clustering attempts to ensure that slots are allocated and freed in SWAPFILE_CLUSTER-sized chunks.
 36 static inline int scan_swap_map(struct swap_info_struct *si)
 37 {
 38     unsigned long offset;
 47     if (si->cluster_nr) {
 48         while (si->cluster_next <= si->highest_bit) {
 49             offset = si->cluster_next++;
 50             if (si->swap_map[offset])
 51                 continue;
 52             si->cluster_nr--;
 53             goto got_page;
 54         }
 55     }
Allocate SWAPFILE_CLUSTER pages sequentially. cluster_nr is initialised to SWAPFILE_CLUSTER and decremented with each allocation
 56     si->cluster_nr = SWAPFILE_CLUSTER;
 57 
 58     /* try to find an empty (even not aligned) cluster. */
 59     offset = si->lowest_bit;
 60  check_next_cluster:
 61     if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit)
 62     {
 63         int nr;
 64         for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++)
 65             if (si->swap_map[nr])
 66             {
 67                 offset = nr+1;
 68                 goto check_next_cluster;
 69             }
 70         /* We found a completly empty cluster, so start
 71          * using it.
 72          */
 73         goto got_page;
 74     }
At this stage, the previous cluster of SWAPFILE_CLUSTER pages has been exhausted, so search for the next completely free block of SWAPFILE_CLUSTER pages.
 75     /* No luck, so now go finegrined as usual. -Andrea */
 76     for (offset = si->lowest_bit; offset <= si->highest_bit ;
                                offset++) {
 77         if (si->swap_map[offset])
 78             continue;
 79         si->lowest_bit = offset+1;
This unusual extract of a for loop scans for a single free slot, starting from lowest_bit
 80     got_page:
 81         if (offset == si->lowest_bit)
 82             si->lowest_bit++;
 83         if (offset == si->highest_bit)
 84             si->highest_bit--;
 85         if (si->lowest_bit > si->highest_bit) {
 86             si->lowest_bit = si->max;
 87             si->highest_bit = 0;
 88         }
 89         si->swap_map[offset] = 1;
 90         nr_swap_pages--;
 91         si->cluster_next = offset+1;
 92         return offset;
 93     }
 94     si->lowest_bit = si->max;
 95     si->highest_bit = 0;
 96     return 0;
 97 }
A slot has been found. Do some housekeeping to update the lowest_bit and highest_bit watermarks, mark the slot in use, record where the next cluster scan should start and return the offset. If the final loop found no free slot, lowest_bit and highest_bit are set to mark the map full and 0 is returned
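The effect of the clustering policy can be seen in isolation with a small user-space model. This is purely illustrative: all names are hypothetical and CLUSTER is 4 here, where the kernel's SWAPFILE_CLUSTER is much larger.

    #include <stdio.h>

    #define CLUSTER 4
    #define SLOTS   16

    static unsigned short map[SLOTS];   /* 0 means free, as in swap_map */
    static int cluster_next, cluster_nr;

    static int toy_scan(void)
    {
        int i, nr;

        /* Phase 1: carry on sequentially within the current cluster */
        while (cluster_nr && cluster_next < SLOTS) {
            i = cluster_next++;
            if (map[i])
                continue;
            cluster_nr--;
            map[i] = 1;
            return i;
        }

        /* Phase 2: look for a completely free run of CLUSTER slots */
        cluster_nr = CLUSTER;
        for (i = 0; i + CLUSTER <= SLOTS; i++) {
            for (nr = i; nr < i + CLUSTER; nr++)
                if (map[nr])
                    break;
            if (nr == i + CLUSTER) {
                map[i] = 1;
                cluster_next = i + 1;
                cluster_nr--;
                return i;
            }
        }

        /* Phase 3: fall back to the first free slot anywhere */
        for (i = 0; i < SLOTS; i++) {
            if (!map[i]) {
                map[i] = 1;
                cluster_next = i + 1;
                return i;
            }
        }
        return -1;                      /* out of swap */
    }

    int main(void)
    {
        int n;

        map[1] = 1;                     /* pre-occupy a single slot */
        for (n = 0; n < 6; n++)
            printf("allocated slot %d\n", toy_scan());
        return 0;
    }

With slot 1 pre-occupied, the six allocations are slots 2, 3, 4, 5, 6 and 7: the allocator prefers starting a fresh run of CLUSTER free slots over reusing the isolated free slot 0, which is the behaviour scan_swap_map() implements with cluster_next and cluster_nr.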
The call graph for this function is shown in Figure 11.3. This function wraps around the normal page cache handler. It first verifies with swap_duplicate() that the swap entry is still valid, taking a reference to it in the process, and then adds the page to the swap cache with add_to_page_cache_unique().
 70 int add_to_swap_cache(struct page *page, swp_entry_t entry)
 71 {
 72     if (page->mapping)
 73         BUG();
 74     if (!swap_duplicate(entry)) {
 75         INC_CACHE_INFO(noent_race);
 76         return -ENOENT;
 77     }
 78     if (add_to_page_cache_unique(page, &swapper_space, entry.val,
 79             page_hash(&swapper_space, entry.val)) != 0) {
 80         swap_free(entry);
 81         INC_CACHE_INFO(exist_race);
 82         return -EEXIST;
 83     }
 84     if (!PageLocked(page))
 85         BUG();
 86     if (!PageSwapCache(page))
 87         BUG();
 88     INC_CACHE_INFO(add_total);
 89     return 0;
 90 }
This function verifies that a swap entry is valid and, if so, increments its swap map count.
1161 int swap_duplicate(swp_entry_t entry)
1162 {
1163     struct swap_info_struct * p;
1164     unsigned long offset, type;
1165     int result = 0;
1166 
1167     type = SWP_TYPE(entry);
1168     if (type >= nr_swapfiles)
1169         goto bad_file;
1170     p = type + swap_info;
1171     offset = SWP_OFFSET(entry);
1172 
1173     swap_device_lock(p);
1174     if (offset < p->max && p->swap_map[offset]) {
1175         if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
1176             p->swap_map[offset]++;
1177             result = 1;
1178         } else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
1179             if (swap_overflow++ < 5)
1180             printk(KERN_WARNING "swap_dup: swap entry overflow\n");
1181             p->swap_map[offset] = SWAP_MAP_MAX;
1182             result = 1;
1183         }
1184     }
1185     swap_device_unlock(p);
1186 out:
1187     return result;
1188 
1189 bad_file:
1190     printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
1191     goto out;
1192 }
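For context, swap_duplicate() and the swap_free() function shown next form a reference-counting pair: every user of a slot, whether a PTE or the swap cache, raises the count in swap_map when it begins referencing the slot and drops it when it stops. A sketch of the pattern with hypothetical helper names, loosely modelled on what fork's copy_page_range() and the PTE teardown paths do:

    /* Hypothetical helpers, not kernel code */
    void sketch_copy_swap_pte(swp_entry_t entry)
    {
        /* the child's new PTE also references the slot, so count++ */
        swap_duplicate(entry);
    }

    void sketch_discard_swap_pte(swp_entry_t entry)
    {
        /* one less PTE references the slot, so count-- */
        swap_free(entry);
    }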
This function decrements the swap_map entry corresponding to the swp_entry_t
214 void swap_free(swp_entry_t entry)
215 {
216     struct swap_info_struct * p;
217 
218     p = swap_info_get(entry);
219     if (p) {
220         swap_entry_free(p, SWP_OFFSET(entry));
221         swap_info_put(p);
222     }
223 }
192 static int swap_entry_free(struct swap_info_struct *p, 
                 unsigned long offset)
193 {
194     int count = p->swap_map[offset];
195 
196     if (count < SWAP_MAP_MAX) {
197         count--;
198         p->swap_map[offset] = count;
199         if (!count) {
200             if (offset < p->lowest_bit)
201                 p->lowest_bit = offset;
202             if (offset > p->highest_bit)
203                 p->highest_bit = offset;
204             nr_swap_pages++;
205         }
206     }
207     return count;
208 }
This function finds the swap_info_struct for the given entry, performs some basic checking and then locks the device.
147 static struct swap_info_struct * swap_info_get(swp_entry_t entry)
148 {
149     struct swap_info_struct * p;
150     unsigned long offset, type;
151 
152     if (!entry.val)
153         goto out;
154     type = SWP_TYPE(entry);
155     if (type >= nr_swapfiles)
156         goto bad_nofile;
157     p = & swap_info[type];
158     if (!(p->flags & SWP_USED))
159         goto bad_device;
160     offset = SWP_OFFSET(entry);
161     if (offset >= p->max)
162         goto bad_offset;
163     if (!p->swap_map[offset])
164         goto bad_free;
165     swap_list_lock();
166     if (p->prio > swap_info[swap_list.next].prio)
167         swap_list.next = type;
168     swap_device_lock(p);
169     return p;
170 
171 bad_free:
172     printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, 
                                                entry.val);
173     goto out;
174 bad_offset:
175     printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, 
                                                entry.val);
176     goto out;
177 bad_device:
178     printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, 
                                                entry.val);
179     goto out;
180 bad_nofile:
181     printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, 
                                                entry.val);
182 out:
183     return NULL;
184 } 
This function simply unlocks the swap area and the swap list
186 static void swap_info_put(struct swap_info_struct * p)
187 {
188     swap_device_unlock(p);
189     swap_list_unlock();
190 }
This is the top-level function for finding a page in the swap cache
161 struct page * lookup_swap_cache(swp_entry_t entry)
162 {
163     struct page *found;
164 
165     found = find_get_page(&swapper_space, entry.val);
166     /*
167      * Unsafe to assert PageSwapCache and mapping on page found:
168      * if SMP nothing prevents swapoff from deleting this page from
169      * the swap cache at this moment.  find_lock_page would prevent
170      * that, but no need to change: we _have_ got the right page.
171      */
172     INC_CACHE_INFO(find_total);
173     if (found)
174         INC_CACHE_INFO(find_success);
175     return found;
176 }
This function returns the requested page from the swap cache if it exists. Otherwise, a page is allocated, placed in the swap cache and the data is scheduled to be read from disk with rw_swap_page().
184 struct page * read_swap_cache_async(swp_entry_t entry)
185 {
186     struct page *found_page, *new_page = NULL;
187     int err;
188 
189     do {
196         found_page = find_get_page(&swapper_space, entry.val);
197         if (found_page)
198             break;
199 
200         /*
201          * Get a new page to read into from swap.
202          */
203         if (!new_page) {
204             new_page = alloc_page(GFP_HIGHUSER);
205             if (!new_page)
206                 break;          /* Out of memory */
207         }
208 
209         /*
210          * Associate the page with swap entry in the swap cache.
211          * May fail (-ENOENT) if swap entry has been freed since
212          * our caller observed it.  May fail (-EEXIST) if there
213          * is already a page associated with this entry in the
214          * swap cache: added by a racing read_swap_cache_async,
215          * or by try_to_swap_out (or shmem_writepage) re-using
216          * the just freed swap entry for an existing page.
217          */
218         err = add_to_swap_cache(new_page, entry);
219         if (!err) {
220             /*
221              * Initiate read into locked page and return.
222              */
223             rw_swap_page(READ, new_page);
224             return new_page;
225         }
226     } while (err != -ENOENT);
227 
228     if (new_page)
229         page_cache_release(new_page);
230     return found_page;
231 }
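To see how lookup_swap_cache() and read_swap_cache_async() are used together, here is a sketch loosely modelled on do_swap_page() in mm/memory.c: the fault handler first probes the swap cache and only allocates and schedules IO when the page is not already present. Locking and error handling are omitted.

    /* Sketch only; see do_swap_page() for the real logic */
    struct page * sketch_swapin(swp_entry_t entry)
    {
        struct page *page;

        page = lookup_swap_cache(entry);
        if (!page)
            page = read_swap_cache_async(entry);
        return page;        /* NULL means out of memory */
    }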
This is the function registered in swap_aops for writing out pages. Its operation is simple. First it calls remove_exclusive_swap_page() to try to free the page. If the page was freed, it is unlocked here before returning because no IO is pending on it. Otherwise rw_swap_page() is called to sync the page with backing storage.
 24 static int swap_writepage(struct page *page)
 25 {
 26     if (remove_exclusive_swap_page(page)) {
 27         UnlockPage(page);
 28         return 0;
 29     }
 30     rw_swap_page(WRITE, page);
 31     return 0;
 32 }
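For reference, swap_writepage() is hooked into the VM through swap_aops, the address_space_operations used by swapper_space and therefore shared by every swap cache page. From mm/swap_state.c, roughly:

    static struct address_space_operations swap_aops = {
        writepage: swap_writepage,
        sync_page: block_sync_page,
    };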
This function tries to work out whether other processes are sharing this page. If possible, the page is removed from the swap cache and freed. Once removed from the swap cache, swap_free() is called to indicate that the swap cache is no longer using the slot. The count will then reflect only the number of PTEs that contain a swp_entry_t for this slot.
287 int remove_exclusive_swap_page(struct page *page)
288 {
289     int retval;
290     struct swap_info_struct * p;
291     swp_entry_t entry;
292 
293     if (!PageLocked(page))
294         BUG();
295     if (!PageSwapCache(page))
296         return 0;
297     if (page_count(page) - !!page->buffers != 2) /* 2: us + cache */
298         return 0;
299 
300     entry.val = page->index;
301     p = swap_info_get(entry);
302     if (!p)
303         return 0;
304 
305     /* Is the only swap cache user the cache itself? */
306     retval = 0;
307     if (p->swap_map[SWP_OFFSET(entry)] == 1) {
308         /* Recheck the page count with the pagecache lock held.. */
309         spin_lock(&pagecache_lock);
310         if (page_count(page) - !!page->buffers == 2) {
311             __delete_from_swap_cache(page);
312             SetPageDirty(page);
313             retval = 1;
314         }
315         spin_unlock(&pagecache_lock);
316     }
317     swap_info_put(p);
318 
319     if (retval) {
320         block_flushpage(page, 0);
321         swap_free(entry);
322         page_cache_release(page);
323     }
324 
325     return retval;
326 }
This function frees an entry from the swap cache and tries to reclaim the page. Note that this function only applies to the swap cache.
 
332 void free_swap_and_cache(swp_entry_t entry)
333 {
334     struct swap_info_struct * p;
335     struct page *page = NULL;
336 
337     p = swap_info_get(entry);
338     if (p) {
339         if (swap_entry_free(p, SWP_OFFSET(entry)) == 1)
340             page = find_trylock_page(&swapper_space, entry.val);
341         swap_info_put(p);
342     }
343     if (page) {
344         page_cache_get(page);
345         /* Only cache user (+us), or swap space full? Free it! */
346         if (page_count(page) - !!page->buffers == 2 || vm_swap_full()) {
347             delete_from_swap_cache(page);
348             SetPageDirty(page);
349         }
350         UnlockPage(page);
351         page_cache_release(page);
352     }
353 }
This is the main function used for reading data from backing storage into a page or for writing data from a page to backing storage. Which operation it performs depends on the first parameter rw. It is basically a wrapper around the core function rw_swap_page_base(). It simply enforces that the operations are only performed on pages in the swap cache.
 85 void rw_swap_page(int rw, struct page *page)
 86 {
 87     swp_entry_t entry;
 88 
 89     entry.val = page->index;
 90 
 91     if (!PageLocked(page))
 92         PAGE_BUG(page);
 93     if (!PageSwapCache(page))
 94         PAGE_BUG(page);
 95     if (!rw_swap_page_base(rw, entry, page))
 96         UnlockPage(page);
 97 }
This is the core function for reading or writing data to the backing storage. Whether it is writing to a partition or a file, the block layer brw_page() function is used to perform the actual IO. This function sets up the necessary buffer information for the block layer to do its job. The brw_page() call performs asynchronous IO, so it is likely to return with the page still locked; the page will be unlocked when the IO completes.
 36 static int rw_swap_page_base(int rw, swp_entry_t entry, 
                                 struct page *page)
 37 {
 38     unsigned long offset;
 39     int zones[PAGE_SIZE/512];
 40     int zones_used;
 41     kdev_t dev = 0;
 42     int block_size;
 43     struct inode *swapf = 0;
 44 
 45     if (rw == READ) {
 46         ClearPageUptodate(page);
 47         kstat.pswpin++;
 48     } else
 49         kstat.pswpout++;
 50 
 51     get_swaphandle_info(entry, &offset, &dev, &swapf);
 52     if (dev) {
 53         zones[0] = offset;
 54         zones_used = 1;
 55         block_size = PAGE_SIZE;
 56     } else if (swapf) {
 57         int i, j;
 58         unsigned int block = 
 59              offset << (PAGE_SHIFT - swapf->i_sb->s_blocksize_bits);
 60 
 61         block_size = swapf->i_sb->s_blocksize;
 62         for (i=0, j=0; j< PAGE_SIZE ; i++, j += block_size)
 63             if (!(zones[i] = bmap(swapf,block++))) {
 64                 printk("rw_swap_page: bad swap file\n");
 65                 return 0;
 66             }
 67         zones_used = i;
 68         dev = swapf->i_dev;
 69     } else {
 70         return 0;
 71     }
 72 
 73     /* block_size == PAGE_SIZE/zones_used */
 74     brw_page(rw, page, dev, zones, block_size);
 75     return 1;
 76 }
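As a worked example of the swap file case, assume PAGE_SIZE is 4096 and the filesystem block size is 1024, so PAGE_SHIFT - s_blocksize_bits is 2. Each page-sized slot then covers four filesystem blocks and the loop fills in four entries of zones[]:

    /* Worked example, not kernel code: offset is the page-sized slot index */
    block = offset << (12 - 10);        /* first block is offset * 4 */
    /* the loop then calls bmap() for blocks offset*4 up to offset*4 + 3,
     * leaving zones_used == 4 and block_size == 1024 */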
This function is responsible for returning either the kdev_t or struct inode that is managing the swap area that entry belongs to.
1197 void get_swaphandle_info(swp_entry_t entry, unsigned long *offset, 
1198                         kdev_t *dev, struct inode **swapf)
1199 {
1200     unsigned long type;
1201     struct swap_info_struct *p;
1202 
1203     type = SWP_TYPE(entry);
1204     if (type >= nr_swapfiles) {
1205         printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_file, 
                                                        entry.val);
1206         return;
1207     }
1208 
1209     p = &swap_info[type];
1210     *offset = SWP_OFFSET(entry);
1211     if (*offset >= p->max && *offset != 0) {
1212         printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_offset, 
                                                        entry.val);
1213         return;
1214     }
1215     if (p->swap_map && !p->swap_map[*offset]) {
1216         printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_offset, 
                                                        entry.val);
1217         return;
1218     }
1219     if (!(p->flags & SWP_USED)) {
1220         printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_file, 
                                                        entry.val);
1221         return;
1222     }
1223 
1224     if (p->swap_device) {
1225         *dev = p->swap_device;
1226     } else if (p->swap_file) {
1227         *swapf = p->swap_file->d_inode;
1228     } else {
1229         printk(KERN_ERR "rw_swap_page: no swap file or device\n");
1230     }
1231     return;
1232 }
This quite large function is responsible for activating swap space. Broadly speaking, its tasks are described by the annotations that follow each block of the function below.
855 asmlinkage long sys_swapon(const char * specialfile, 
                               int swap_flags)
856 {
857       struct swap_info_struct * p;
858       struct nameidata nd;
859       struct inode * swap_inode;
860       unsigned int type;
861       int i, j, prev;
862       int error;
863       static int least_priority = 0;
864       union swap_header *swap_header = 0;
865       int swap_header_version;
866       int nr_good_pages = 0;
867       unsigned long maxpages = 1;
868       int swapfilesize;
869       struct block_device *bdev = NULL;
870       unsigned short *swap_map;
871       
872       if (!capable(CAP_SYS_ADMIN))
873         return -EPERM;
874       lock_kernel();
875       swap_list_lock();
876       p = swap_info;
877       for (type = 0 ; type < nr_swapfiles ; type++,p++)
878         if (!(p->flags & SWP_USED))
879           break;
880       error = -EPERM;
881       if (type >= MAX_SWAPFILES) {
882         swap_list_unlock();
883         goto out;
884       }
885       if (type >= nr_swapfiles)
886         nr_swapfiles = type+1;
887       p->flags = SWP_USED;
888       p->swap_file = NULL;
889       p->swap_vfsmnt = NULL;
890       p->swap_device = 0;
891       p->swap_map = NULL;
892       p->lowest_bit = 0;
893       p->highest_bit = 0;
894       p->cluster_nr = 0;
895       p->sdev_lock = SPIN_LOCK_UNLOCKED;
896       p->next = -1;
897       if (swap_flags & SWAP_FLAG_PREFER) {
898         p->prio =
899           (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
900       } else {
901         p->prio = --least_priority;
902       }
903       swap_list_unlock();
Find a free swap_info_struct and initialise it with default values
904       error = user_path_walk(specialfile, &nd);
905       if (error)
906         goto bad_swap_2;
907 
908       p->swap_file = nd.dentry;
909       p->swap_vfsmnt = nd.mnt;
910       swap_inode = nd.dentry->d_inode;
911       error = -EINVAL;
912 
Traverse the VFS and get some information about the special file
913       if (S_ISBLK(swap_inode->i_mode)) {
914         kdev_t dev = swap_inode->i_rdev;
915         struct block_device_operations *bdops;
916         devfs_handle_t de;
917 
918         p->swap_device = dev;
919         set_blocksize(dev, PAGE_SIZE);
920         
921         bd_acquire(swap_inode);
922         bdev = swap_inode->i_bdev;
923         de = devfs_get_handle_from_inode(swap_inode);
924         bdops = devfs_get_ops(de);
925         if (bdops) bdev->bd_op = bdops;
926 
927         error = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0,
                   BDEV_SWAP);
928         devfs_put_ops(de);/* Decrement module use count 
                               * now we're safe*/
929         if (error)
930           goto bad_swap_2;
931         set_blocksize(dev, PAGE_SIZE);
932         error = -ENODEV;
933         if (!dev || (blk_size[MAJOR(dev)] &&
934          !blk_size[MAJOR(dev)][MINOR(dev)]))
935           goto bad_swap;
936         swapfilesize = 0;
937         if (blk_size[MAJOR(dev)])
938           swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)]
939             >> (PAGE_SHIFT - 10);
940       } else if (S_ISREG(swap_inode->i_mode))
941         swapfilesize = swap_inode->i_size >> PAGE_SHIFT;
942       else
943         goto bad_swap;
If it is a partition, configure the block device before calculating the size of the area; otherwise obtain the size from the inode for the file.
945       error = -EBUSY;
946       for (i = 0 ; i < nr_swapfiles ; i++) {
947         struct swap_info_struct *q = &swap_info[i];
948         if (i == type || !q->swap_file)
949           continue;
950         if (swap_inode->i_mapping ==
             q->swap_file->d_inode->i_mapping)
951           goto bad_swap;
952       }
953 
954       swap_header = (void *) __get_free_page(GFP_USER);
955       if (!swap_header) {
956         printk("Unable to start swapping: out of memory :-)\n");
957         error = -ENOMEM;
958         goto bad_swap;
959       }
960 
961       lock_page(virt_to_page(swap_header));
962       rw_swap_page_nolock(READ, SWP_ENTRY(type,0), 
            (char *) swap_header);
963 
964       if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
965         swap_header_version = 1;
966       else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
967         swap_header_version = 2;
968       else {
969         printk("Unable to find swap-space signature\n");
970         error = -EINVAL;
971         goto bad_swap;
972       }
974       switch (swap_header_version) {
975       case 1:
976         memset(((char *) swap_header)+PAGE_SIZE-10,0,10);
977         j = 0;
978         p->lowest_bit = 0;
979         p->highest_bit = 0;
980         for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
981           if (test_bit(i,(char *) swap_header)) {
982             if (!p->lowest_bit)
983                   p->lowest_bit = i;
984             p->highest_bit = i;
985             maxpages = i+1;
986             j++;
987           }
988         }
989         nr_good_pages = j;
990         p->swap_map = vmalloc(maxpages * sizeof(short));
991         if (!p->swap_map) {
992           error = -ENOMEM;        
993           goto bad_swap;
994         }
995         for (i = 1 ; i < maxpages ; i++) {
996           if (test_bit(i,(char *) swap_header))
997             p->swap_map[i] = 0;
998           else
999             p->swap_map[i] = SWAP_MAP_BAD;
1000         }
1001         break;
1002 
Read in the information needed to populate the swap_map when the swap area is version 1.
1003       case 2:
1006         if (swap_header->info.version != 1) {
1007           printk(KERN_WARNING
1008            "Unable to handle swap header version %d\n",
1009            swap_header->info.version);
1010           error = -EINVAL;
1011           goto bad_swap;
1012         }
1013 
1014         p->lowest_bit  = 1;
1015         maxpages = SWP_OFFSET(SWP_ENTRY(0,~0UL)) - 1;
1016         if (maxpages > swap_header->info.last_page)
1017           maxpages = swap_header->info.last_page;
1018         p->highest_bit = maxpages - 1;
1019 
1020         error = -EINVAL;
1021         if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
1022           goto bad_swap;
1023         
1025         if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
1026           error = -ENOMEM;
1027           goto bad_swap;
1028         }
1029 
1030         error = 0;
1031         memset(p->swap_map, 0, maxpages * sizeof(short));
1032         for (i=0; i<swap_header->info.nr_badpages; i++) {
1033           int page = swap_header->info.badpages[i];
1034           if (page <= 0 || 
             page >= swap_header->info.last_page)
1035             error = -EINVAL;
1036           else
1037             p->swap_map[page] = SWAP_MAP_BAD;
1038         }
1039         nr_good_pages = swap_header->info.last_page -
1040             swap_header->info.nr_badpages -
1041             1 /* header page */;
1042         if (error) 
1043           goto bad_swap;
1044       }
Read the header information when the file format is version 2
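For reference, the layout of the union swap_header parsed by both cases is declared in <linux/swap.h> and looks roughly like the following. The same page is overlaid by the bitmap or info structure and by the magic signature in its last 10 bytes.

    union swap_header {
        struct {
            char reserved[PAGE_SIZE - 10];
            char magic[10];               /* SWAP-SPACE or SWAPSPACE2 */
        } magic;
        struct {
            char         bootbits[1024];  /* space for disklabel etc. */
            unsigned int version;
            unsigned int last_page;
            unsigned int nr_badpages;
            unsigned int padding[125];
            unsigned int badpages[1];
        } info;
    };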
1045       
1046       if (swapfilesize && maxpages > swapfilesize) {
1047         printk(KERN_WARNING
1048          "Swap area shorter than signature indicates\n");
1049         error = -EINVAL;
1050         goto bad_swap;
1051       }
1052       if (!nr_good_pages) {
1053         printk(KERN_WARNING "Empty swap-file\n");
1054         error = -EINVAL;
1055         goto bad_swap;
1056       }
1057       p->swap_map[0] = SWAP_MAP_BAD;
1058       swap_list_lock();
1059       swap_device_lock(p);
1060       p->max = maxpages;
1061       p->flags = SWP_WRITEOK;
1062       p->pages = nr_good_pages;
1063       nr_swap_pages += nr_good_pages;
1064       total_swap_pages += nr_good_pages;
1065       printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n",
1066        nr_good_pages<<(PAGE_SHIFT-10), p->prio);
1068       /* insert swap space into swap_list: */
1069       prev = -1;
1070       for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
1071         if (p->prio >= swap_info[i].prio) {
1072           break;
1073         }
1074         prev = i;
1075       }
1076       p->next = i;
1077       if (prev < 0) {
1078         swap_list.head = swap_list.next = p - swap_info;
1079       } else {
1080         swap_info[prev].next = p - swap_info;
1081       }
1082       swap_device_unlock(p);
1083       swap_list_unlock();
1084       error = 0;
1085       goto out;
1086 bad_swap:
1087       if (bdev)
1088         blkdev_put(bdev, BDEV_SWAP);
1089 bad_swap_2:
1090       swap_list_lock();
1091       swap_map = p->swap_map;
1092       nd.mnt = p->swap_vfsmnt;
1093       nd.dentry = p->swap_file;
1094       p->swap_device = 0;
1095       p->swap_file = NULL;
1096       p->swap_vfsmnt = NULL;
1097       p->swap_map = NULL;
1098       p->flags = 0;
1099       if (!(swap_flags & SWAP_FLAG_PREFER))
1100         ++least_priority;
1101       swap_list_unlock();
1102       if (swap_map)
1103         vfree(swap_map);
1104       path_release(&nd);
1105 out:
1106       if (swap_header)
1107         free_page((long) swap_header);
1108       unlock_kernel();
1109       return error;
1110 }
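From user space, this system call is reached through the swapon(2) wrapper once the area has been prepared with mkswap; swapoff(2) is the counterpart, handled by sys_swapoff() discussed below. A minimal sketch of activating an area with an explicit priority:

    #include <sys/swap.h>  /* swapon(), SWAP_FLAG_PREFER, SWAP_FLAG_PRIO_* */

    int activate_swap(const char *path, int prio)
    {
        int flags = SWAP_FLAG_PREFER |
            ((prio << SWAP_FLAG_PRIO_SHIFT) & SWAP_FLAG_PRIO_MASK);
        return swapon(path, flags);     /* requires CAP_SYS_ADMIN */
    }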
This function is called during the initialisation of kswapd to set the size of page_cluster. This variable determines how many pages to read ahead from files and from backing storage when paging in data.
100 void __init swap_setup(void)
101 {
102     unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);
103 
104     /* Use a smaller cluster for small-memory machines */
105     if (megs < 16)
106         page_cluster = 2;
107     else
108         page_cluster = 3;
109     /*
110      * Right now other parts of the system means that we
111      * _really_ don't want to cluster much more
112      */
113 }
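page_cluster is used as a power of two: the swap readahead path (see valid_swaphandles() and swapin_readahead()) reads in up to 1 << page_cluster pages at a time, that is 4 pages on machines with less than 16MiB of memory and 8 pages otherwise.

    /* Sketch, not kernel code: how page_cluster bounds swap readahead */
    unsigned long max_readahead_pages(int page_cluster)
    {
        return 1UL << page_cluster;     /* 1 << 2 = 4, or 1 << 3 = 8 */
    }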
This function is principally concerned with updating the swap_info_struct and the swap lists. The main task of paging in all pages in the area is the responsibility of try_to_unuse(). Broadly speaking, its tasks are described by the annotations that follow each block of the function below.
720 asmlinkage long sys_swapoff(const char * specialfile)
721 {
722     struct swap_info_struct * p = NULL;
723     unsigned short *swap_map;
724     struct nameidata nd;
725     int i, type, prev;
726     int err;
727     
728     if (!capable(CAP_SYS_ADMIN))
729         return -EPERM;
730 
731     err = user_path_walk(specialfile, &nd);
732     if (err)
733         goto out;
734 
735     lock_kernel();
736     prev = -1;
737     swap_list_lock();
738     for (type = swap_list.head; type >= 0; 
         type = swap_info[type].next) {
739         p = swap_info + type;
740         if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
741             if (p->swap_file == nd.dentry)
742               break;
743         }
744         prev = type;
745     }
746     err = -EINVAL;
747     if (type < 0) {
748         swap_list_unlock();
749         goto out_dput;
750     }
751 
752     if (prev < 0) {
753         swap_list.head = p->next;
754     } else {
755         swap_info[prev].next = p->next;
756     }
757     if (type == swap_list.next) {
758         /* just pick something that's safe... */
759         swap_list.next = swap_list.head;
760     }
761     nr_swap_pages -= p->pages;
762     total_swap_pages -= p->pages;
763     p->flags = SWP_USED;
Acquire the BKL, find the swap_info_struct for the area to be deactivated and remove it from the swap list.
764     swap_list_unlock();
765     unlock_kernel();
766     err = try_to_unuse(type);
767     lock_kernel();
768     if (err) {
769         /* re-insert swap space back into swap_list */
770         swap_list_lock();
771         for (prev = -1, i = swap_list.head; 
                 i >= 0; 
                 prev = i, i = swap_info[i].next)
772             if (p->prio >= swap_info[i].prio)
773                 break;
774         p->next = i;
775         if (prev < 0)
776             swap_list.head = swap_list.next = p - swap_info;
777         else
778             swap_info[prev].next = p - swap_info;
779         nr_swap_pages += p->pages;
780         total_swap_pages += p->pages;
781         p->flags = SWP_WRITEOK;
782         swap_list_unlock();
783         goto out_dput;
784     }
Page in all entries with try_to_unuse() and then reacquire the BKL. If we failed to page in all pages, reinsert the area into the swap list
785     if (p->swap_device)
786         blkdev_put(p->swap_file->d_inode->i_bdev, BDEV_SWAP);
787     path_release(&nd);
788 
789     swap_list_lock();
790     swap_device_lock(p);
791     nd.mnt = p->swap_vfsmnt;
792     nd.dentry = p->swap_file;
793     p->swap_vfsmnt = NULL;
794     p->swap_file = NULL;
795     p->swap_device = 0;
796     p->max = 0;
797     swap_map = p->swap_map;
798     p->swap_map = NULL;
799     p->flags = 0;
800     swap_device_unlock(p);
801     swap_list_unlock();
802     vfree(swap_map);
803     err = 0;
804 
805 out_dput:
806     unlock_kernel();
807     path_release(&nd);
808 out:
809     return err;
810 }
Otherwise the swap area was successfully deactivated, so close the block device and mark the swap_info_struct free
This function is heavily commented in the source code, although parts of the commentary are speculative or slightly inaccurate. The comments are omitted here for brevity.
513 static int try_to_unuse(unsigned int type)
514 {
515     struct swap_info_struct * si = &swap_info[type];
516     struct mm_struct *start_mm;
517     unsigned short *swap_map;
518     unsigned short swcount;
519     struct page *page;
520     swp_entry_t entry;
521     int i = 0;
522     int retval = 0;
523     int reset_overflow = 0;
524     int shmem;
525 
540     start_mm = &init_mm;
541     atomic_inc(&init_mm.mm_users);
542 
556     while ((i = find_next_to_unuse(si, i))) {
557         /* 
558          * Get a page for the entry, using the existing swap
559          * cache page if there is one.  Otherwise, get a clean
560          * page and read the swap into it. 
561          */
562         swap_map = &si->swap_map[i];
563         entry = SWP_ENTRY(type, i);
564         page = read_swap_cache_async(entry);
565         if (!page) {
572             if (!*swap_map)
573                 continue;
574             retval = -ENOMEM;
575             break;
576         }
577 
578         /*
579          * Don't hold on to start_mm if it looks like exiting.
580          */
581         if (atomic_read(&start_mm->mm_users) == 1) {
582             mmput(start_mm);
583             start_mm = &init_mm;
584             atomic_inc(&init_mm.mm_users);
585         }
587         /*
588          * Wait for and lock page.  When do_swap_page races with
589          * try_to_unuse, do_swap_page can handle the fault much
590          * faster than try_to_unuse can locate the entry.  This
591          * apparently redundant "wait_on_page" lets try_to_unuse
592          * defer to do_swap_page in such a case - in some tests,
593          * do_swap_page and try_to_unuse repeatedly compete.
594          */
595         wait_on_page(page);
596         lock_page(page);
597 
598         /*
599          * Remove all references to entry, without blocking.
600          * Whenever we reach init_mm, there's no address space
601          * to search, but use it as a reminder to search shmem.
602          */
603         shmem = 0;
604         swcount = *swap_map;
605         if (swcount > 1) {
606             flush_page_to_ram(page);
607             if (start_mm == &init_mm)
608                 shmem = shmem_unuse(entry, page);
609             else
610                 unuse_process(start_mm, entry, page);
611         }
612         if (*swap_map > 1) {
613             int set_start_mm = (*swap_map >= swcount);
614             struct list_head *p = &start_mm->mmlist;
615             struct mm_struct *new_start_mm = start_mm;
616             struct mm_struct *mm;
617 
618             spin_lock(&mmlist_lock);
619             while (*swap_map > 1 &&
620                 (p = p->next) != &start_mm->mmlist) {
621                 mm = list_entry(p, struct mm_struct,
                            mmlist);
622                 swcount = *swap_map;
623                 if (mm == &init_mm) {
624                     set_start_mm = 1;
625                     spin_unlock(&mmlist_lock);
626                     shmem = shmem_unuse(entry, page);
627                     spin_lock(&mmlist_lock);
628                 } else
629                     unuse_process(mm, entry, page);
630                 if (set_start_mm && *swap_map < swcount) {
631                     new_start_mm = mm;
632                     set_start_mm = 0;
633                 }
634             }
635             atomic_inc(&new_start_mm->mm_users);
636             spin_unlock(&mmlist_lock);
637             mmput(start_mm);
638             start_mm = new_start_mm;
639         }
654         if (*swap_map == SWAP_MAP_MAX) {
655             swap_list_lock();
656             swap_device_lock(si);
657             nr_swap_pages++;
658             *swap_map = 1;
659             swap_device_unlock(si);
660             swap_list_unlock();
661             reset_overflow = 1;
662         }
683         if ((*swap_map > 1) && PageDirty(page) &&
                PageSwapCache(page)) {
684             rw_swap_page(WRITE, page);
685             lock_page(page);
686         }
687         if (PageSwapCache(page)) {
688             if (shmem)
689                 swap_duplicate(entry);
690             else
691                 delete_from_swap_cache(page);
692         }
699         SetPageDirty(page);
700         UnlockPage(page);
701         page_cache_release(page);
708         if (current->need_resched)
714             schedule();
715     }
716 
717     mmput(start_mm);
718     if (reset_overflow) {
719         printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
720         swap_overflow = 0;
721     }
722     return retval;
723 }
This function begins the page table walk required to remove the requested page and entry from the process page tables managed by mm. This is only required when a swap area is being deactivated so, while expensive, it is a very rare operation. This set of functions should be instantly recognisable as a standard page-table walk.
454 static void unuse_process(struct mm_struct * mm,
455                         swp_entry_t entry, struct page* page)
456 {
457     struct vm_area_struct* vma;
458 
459     /*
460      * Go through process' page directory.
461      */
462     spin_lock(&mm->page_table_lock);
463     for (vma = mm->mmap; vma; vma = vma->vm_next) {
464         pgd_t * pgd = pgd_offset(mm, vma->vm_start);
465         unuse_vma(vma, pgd, entry, page);
466     }
467     spin_unlock(&mm->page_table_lock);
468     return;
469 }
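The nesting of the walk, schematically:

    unuse_process(mm)                 /* for each VMA: pgd_offset()     */
      -> unuse_vma(vma, pgdir)        /* for each PGD entry in the VMA  */
           -> unuse_pgd(...)          /* pmd_offset(), for each PMD     */
                -> unuse_pmd(...)     /* pte_offset(), for each PTE     */
                     -> unuse_pte(...) /* compare the PTE with entry    */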
This function searches the requested VMA for page table entries mapping the page and using the given swap entry. It calls unuse_pgd() for every PGD this VMA maps.
440 static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
441                         swp_entry_t entry, struct page* page)
442 {
443     unsigned long start = vma->vm_start, end = vma->vm_end;
444 
445     if (start >= end)
446         BUG();
447     do {
448         unuse_pgd(vma, pgdir, start, end - start, entry, page);
449         start = (start + PGDIR_SIZE) & PGDIR_MASK;
450         pgdir++;
451     } while (start && (start < end));
452 }
This function searches the requested PGD for page table entries mapping the page and using the given swap entry. It calls unuse_pmd() for every PMD this PGD maps.
409 static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
410         unsigned long address, unsigned long size,
411         swp_entry_t entry, struct page* page)
412 {
413     pmd_t * pmd;
414     unsigned long offset, end;
415 
416     if (pgd_none(*dir))
417         return;
418     if (pgd_bad(*dir)) {
419         pgd_ERROR(*dir);
420         pgd_clear(dir);
421         return;
422     }
423     pmd = pmd_offset(dir, address);
424     offset = address & PGDIR_MASK;
425     address &= ~PGDIR_MASK;
426     end = address + size;
427     if (end > PGDIR_SIZE)
428         end = PGDIR_SIZE;
429     if (address >= end)
430         BUG();
431     do {
432         unuse_pmd(vma, pmd, address, end - address, offset, entry,
433                   page);
434         address = (address + PMD_SIZE) & PMD_MASK;
435         pmd++;
436     } while (address && (address < end));
437 }
This function searches the requested PMD for page table entries mapping the page and using the given swap entry. It calls unuse_pte() for every PTE this PMD maps.
381 static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
382      unsigned long address, unsigned long size, unsigned long offset,
383      swp_entry_t entry, struct page* page)
384 {
385     pte_t * pte;
386     unsigned long end;
387 
388     if (pmd_none(*dir))
389         return;
390     if (pmd_bad(*dir)) {
391         pmd_ERROR(*dir);
392         pmd_clear(dir);
393         return;
394     }
395     pte = pte_offset(dir, address);
396     offset += address & PMD_MASK;
397     address &= ~PMD_MASK;
398     end = address + size;
399     if (end > PMD_SIZE)
400         end = PMD_SIZE;
401     do {
402         unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page);
403         address += PAGE_SIZE;
404         pte++;
405     } while (address && (address < end));
406 }
This function checks whether the PTE at dir matches the entry we are searching for. If it does, a reference is taken to the page, the PTE is rewritten to map it and the swap entry is freed with swap_free().
365 static inline void unuse_pte(struct vm_area_struct * vma, 
            unsigned long address,
366         pte_t *dir, swp_entry_t entry, struct page* page)
367 {
368     pte_t pte = *dir;
369 
370     if (likely(pte_to_swp_entry(pte).val != entry.val))
371         return;
372     if (unlikely(pte_none(pte) || pte_present(pte)))
373         return;
374     get_page(page);
375     set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
376     swap_free(entry);
377     ++vma->vm_mm->rss;
378 }