Embedded Freedom
• source navigation • diff markup • identifier search • freetext search •
1 /* 2 * Fast Userspace Mutexes (which I call "Futexes!"). 3 * (C) Rusty Russell, IBM 2002 4 * 5 * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar 6 * (C) Copyright 2003 Red Hat Inc, All Rights Reserved 7 * 8 * Removed page pinning, fix privately mapped COW pages and other cleanups 9 * (C) Copyright 2003, 2004 Jamie Lokier 10 * 11 * Robust futex support started by Ingo Molnar 12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved 13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes. 14 * 15 * PI-futex support started by Ingo Molnar and Thomas Gleixner 16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> 18 * 19 * PRIVATE futexes by Eric Dumazet 20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> 21 * 22 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 23 * enough at me, Linus for the original (flawed) idea, Matthew 24 * Kirkwood for proof-of-concept implementation. 25 * 26 * "The futexes are also cursed." 27 * "But they come in a choice of three flavours!" 28 * 29 * This program is free software; you can redistribute it and/or modify 30 * it under the terms of the GNU General Public License as published by 31 * the Free Software Foundation; either version 2 of the License, or 32 * (at your option) any later version. 33 * 34 * This program is distributed in the hope that it will be useful, 35 * but WITHOUT ANY WARRANTY; without even the implied warranty of 36 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 37 * GNU General Public License for more details. 38 * 39 * You should have received a copy of the GNU General Public License 40 * along with this program; if not, write to the Free Software 41 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 42 */ 43 #include <linux/slab.h> 44 #include <linux/poll.h> 45 #include <linux/fs.h> 46 #include <linux/file.h> 47 #include <linux/jhash.h> 48 #include <linux/init.h> 49 #include <linux/futex.h> 50 #include <linux/mount.h> 51 #include <linux/pagemap.h> 52 #include <linux/syscalls.h> 53 #include <linux/signal.h> 54 #include <linux/module.h> 55 #include <linux/magic.h> 56 #include <linux/pid.h> 57 #include <linux/nsproxy.h> 58 59 #include <asm/futex.h> 60 61 #include "rtmutex_common.h" 62 63 int __read_mostly futex_cmpxchg_enabled; 64 65 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 66 67 /* 68 * Priority Inheritance state: 69 */ 70 struct futex_pi_state { 71 /* 72 * list of 'owned' pi_state instances - these have to be 73 * cleaned up in do_exit() if the task exits prematurely: 74 */ 75 struct list_head list; 76 77 /* 78 * The PI object: 79 */ 80 struct rt_mutex pi_mutex; 81 82 struct task_struct *owner; 83 atomic_t refcount; 84 85 union futex_key key; 86 }; 87 88 /* 89 * We use this hashed waitqueue instead of a normal wait_queue_t, so 90 * we can wake only the relevant ones (hashed queues may be shared). 91 * 92 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 93 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 94 * The order of wakup is always to make the first condition true, then 95 * wake up q->waiters, then make the second condition true. 96 */ 97 struct futex_q { 98 struct plist_node list; 99 wait_queue_head_t waiters; 100 101 /* Which hash list lock to use: */ 102 spinlock_t *lock_ptr; 103 104 /* Key which the futex is hashed on: */ 105 union futex_key key; 106 107 /* Optional priority inheritance state: */ 108 struct futex_pi_state *pi_state; 109 struct task_struct *task; 110 111 /* Bitset for the optional bitmasked wakeup */ 112 u32 bitset; 113 }; 114 115 /* 116 * Split the global futex_lock into every hash list lock. 117 */ 118 struct futex_hash_bucket { 119 spinlock_t lock; 120 struct plist_head chain; 121 }; 122 123 static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; 124 125 /* 126 * Take mm->mmap_sem, when futex is shared 127 */ 128 static inline void futex_lock_mm(struct rw_semaphore *fshared) 129 { 130 if (fshared) 131 down_read(fshared); 132 } 133 134 /* 135 * Release mm->mmap_sem, when the futex is shared 136 */ 137 static inline void futex_unlock_mm(struct rw_semaphore *fshared) 138 { 139 if (fshared) 140 up_read(fshared); 141 } 142 143 /* 144 * We hash on the keys returned from get_futex_key (see below). 145 */ 146 static struct futex_hash_bucket *hash_futex(union futex_key *key) 147 { 148 u32 hash = jhash2((u32*)&key->both.word, 149 (sizeof(key->both.word)+sizeof(key->both.ptr))/4, 150 key->both.offset); 151 return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)]; 152 } 153 154 /* 155 * Return 1 if two futex_keys are equal, 0 otherwise. 156 */ 157 static inline int match_futex(union futex_key *key1, union futex_key *key2) 158 { 159 return (key1->both.word == key2->both.word 160 && key1->both.ptr == key2->both.ptr 161 && key1->both.offset == key2->both.offset); 162 } 163 164 /** 165 * get_futex_key - Get parameters which are the keys for a futex. 166 * @uaddr: virtual address of the futex 167 * @shared: NULL for a PROCESS_PRIVATE futex, 168 * ¤t->mm->mmap_sem for a PROCESS_SHARED futex 169 * @key: address where result is stored. 170 * 171 * Returns a negative error code or 0 172 * The key words are stored in *key on success. 173 * 174 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, 175 * offset_within_page). For private mappings, it's (uaddr, current->mm). 176 * We can usually work out the index without swapping in the page. 177 * 178 * fshared is NULL for PROCESS_PRIVATE futexes 179 * For other futexes, it points to ¤t->mm->mmap_sem and 180 * caller must have taken the reader lock. but NOT any spinlocks. 181 */ 182 static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, 183 union futex_key *key) 184 { 185 unsigned long address = (unsigned long)uaddr; 186 struct mm_struct *mm = current->mm; 187 struct vm_area_struct *vma; 188 struct page *page; 189 int err; 190 191 /* 192 * The futex address must be "naturally" aligned. 193 */ 194 key->both.offset = address % PAGE_SIZE; 195 if (unlikely((address % sizeof(u32)) != 0)) 196 return -EINVAL; 197 address -= key->both.offset; 198 199 /* 200 * PROCESS_PRIVATE futexes are fast. 201 * As the mm cannot disappear under us and the 'key' only needs 202 * virtual address, we dont even have to find the underlying vma. 203 * Note : We do have to check 'uaddr' is a valid user address, 204 * but access_ok() should be faster than find_vma() 205 */ 206 if (!fshared) { 207 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) 208 return -EFAULT; 209 key->private.mm = mm; 210 key->private.address = address; 211 return 0; 212 } 213 /* 214 * The futex is hashed differently depending on whether 215 * it's in a shared or private mapping. So check vma first. 216 */ 217 vma = find_extend_vma(mm, address); 218 if (unlikely(!vma)) 219 return -EFAULT; 220 221 /* 222 * Permissions. 223 */ 224 if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) 225 return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; 226 227 /* 228 * Private mappings are handled in a simple way. 229 * 230 * NOTE: When userspace waits on a MAP_SHARED mapping, even if 231 * it's a read-only handle, it's expected that futexes attach to 232 * the object not the particular process. Therefore we use 233 * VM_MAYSHARE here, not VM_SHARED which is restricted to shared 234 * mappings of _writable_ handles. 235 */ 236 if (likely(!(vma->vm_flags & VM_MAYSHARE))) { 237 key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */ 238 key->private.mm = mm; 239 key->private.address = address; 240 return 0; 241 } 242 243 /* 244 * Linear file mappings are also simple. 245 */ 246 key->shared.inode = vma->vm_file->f_path.dentry->d_inode; 247 key->both.offset |= FUT_OFF_INODE; /* inode-based key. */ 248 if (likely(!(vma->vm_flags & VM_NONLINEAR))) { 249 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) 250 + vma->vm_pgoff); 251 return 0; 252 } 253 254 /* 255 * We could walk the page table to read the non-linear 256 * pte, and get the page index without fetching the page 257 * from swap. But that's a lot of code to duplicate here 258 * for a rare case, so we simply fetch the page. 259 */ 260 err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); 261 if (err >= 0) { 262 key->shared.pgoff = 263 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 264 put_page(page); 265 return 0; 266 } 267 return err; 268 } 269 270 /* 271 * Take a reference to the resource addressed by a key. 272 * Can be called while holding spinlocks. 273 * 274 */ 275 static void get_futex_key_refs(union futex_key *key) 276 { 277 if (key->both.ptr == NULL) 278 return; 279 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 280 case FUT_OFF_INODE: 281 atomic_inc(&key->shared.inode->i_count); 282 break; 283 case FUT_OFF_MMSHARED: 284 atomic_inc(&key->private.mm->mm_count); 285 break; 286 } 287 } 288 289 /* 290 * Drop a reference to the resource addressed by a key. 291 * The hash bucket spinlock must not be held. 292 */ 293 static void drop_futex_key_refs(union futex_key *key) 294 { 295 if (!key->both.ptr) 296 return; 297 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 298 case FUT_OFF_INODE: 299 iput(key->shared.inode); 300 break; 301 case FUT_OFF_MMSHARED: 302 mmdrop(key->private.mm); 303 break; 304 } 305 } 306 307 static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 308 { 309 u32 curval; 310 311 pagefault_disable(); 312 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 313 pagefault_enable(); 314 315 return curval; 316 } 317 318 static int get_futex_value_locked(u32 *dest, u32 __user *from) 319 { 320 int ret; 321 322 pagefault_disable(); 323 ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); 324 pagefault_enable(); 325 326 return ret ? -EFAULT : 0; 327 } 328 329 /* 330 * Fault handling. 331 * if fshared is non NULL, current->mm->mmap_sem is already held 332 */ 333 static int futex_handle_fault(unsigned long address, 334 struct rw_semaphore *fshared, int attempt) 335 { 336 struct vm_area_struct * vma; 337 struct mm_struct *mm = current->mm; 338 int ret = -EFAULT; 339 340 if (attempt > 2) 341 return ret; 342 343 if (!fshared) 344 down_read(&mm->mmap_sem); 345 vma = find_vma(mm, address); 346 if (vma && address >= vma->vm_start && 347 (vma->vm_flags & VM_WRITE)) { 348 int fault; 349 fault = handle_mm_fault(mm, vma, address, 1); 350 if (unlikely((fault & VM_FAULT_ERROR))) { 351 #if 0 352 /* XXX: let's do this when we verify it is OK */ 353 if (ret & VM_FAULT_OOM) 354 ret = -ENOMEM; 355 #endif 356 } else { 357 ret = 0; 358 if (fault & VM_FAULT_MAJOR) 359 current->maj_flt++; 360 else 361 current->min_flt++; 362 } 363 } 364 if (!fshared) 365 up_read(&mm->mmap_sem); 366 return ret; 367 } 368 369 /* 370 * PI code: 371 */ 372 static int refill_pi_state_cache(void) 373 { 374 struct futex_pi_state *pi_state; 375 376 if (likely(current->pi_state_cache)) 377 return 0; 378 379 pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); 380 381 if (!pi_state) 382 return -ENOMEM; 383 384 INIT_LIST_HEAD(&pi_state->list); 385 /* pi_mutex gets initialized later */ 386 pi_state->owner = NULL; 387 atomic_set(&pi_state->refcount, 1); 388 389 current->pi_state_cache = pi_state; 390 391 return 0; 392 } 393 394 static struct futex_pi_state * alloc_pi_state(void) 395 { 396 struct futex_pi_state *pi_state = current->pi_state_cache; 397 398 WARN_ON(!pi_state); 399 current->pi_state_cache = NULL; 400 401 return pi_state; 402 } 403 404 static void free_pi_state(struct futex_pi_state *pi_state) 405 { 406 if (!atomic_dec_and_test(&pi_state->refcount)) 407 return; 408 409 /* 410 * If pi_state->owner is NULL, the owner is most probably dying 411 * and has cleaned up the pi_state already 412 */ 413 if (pi_state->owner) { 414 spin_lock_irq(&pi_state->owner->pi_lock); 415 list_del_init(&pi_state->list); 416 spin_unlock_irq(&pi_state->owner->pi_lock); 417 418 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); 419 } 420 421 if (current->pi_state_cache) 422 kfree(pi_state); 423 else { 424 /* 425 * pi_state->list is already empty. 426 * clear pi_state->owner. 427 * refcount is at 0 - put it back to 1. 428 */ 429 pi_state->owner = NULL; 430 atomic_set(&pi_state->refcount, 1); 431 current->pi_state_cache = pi_state; 432 } 433 } 434 435 /* 436 * Look up the task based on what TID userspace gave us. 437 * We dont trust it. 438 */ 439 static struct task_struct * futex_find_get_task(pid_t pid) 440 { 441 struct task_struct *p; 442 443 rcu_read_lock(); 444 p = find_task_by_vpid(pid); 445 if (!p || ((current->euid != p->euid) && (current->euid != p->uid))) 446 p = ERR_PTR(-ESRCH); 447 else 448 get_task_struct(p); 449 450 rcu_read_unlock(); 451 452 return p; 453 } 454 455 /* 456 * This task is holding PI mutexes at exit time => bad. 457 * Kernel cleans up PI-state, but userspace is likely hosed. 458 * (Robust-futex cleanup is separate and might save the day for userspace.) 459 */ 460 void exit_pi_state_list(struct task_struct *curr) 461 { 462 struct list_head *next, *head = &curr->pi_state_list; 463 struct futex_pi_state *pi_state; 464 struct futex_hash_bucket *hb; 465 union futex_key key; 466 467 if (!futex_cmpxchg_enabled) 468 return; 469 /* 470 * We are a ZOMBIE and nobody can enqueue itself on 471 * pi_state_list anymore, but we have to be careful 472 * versus waiters unqueueing themselves: 473 */ 474 spin_lock_irq(&curr->pi_lock); 475 while (!list_empty(head)) { 476 477 next = head->next; 478 pi_state = list_entry(next, struct futex_pi_state, list); 479 key = pi_state->key; 480 hb = hash_futex(&key); 481 spin_unlock_irq(&curr->pi_lock); 482 483 spin_lock(&hb->lock); 484 485 spin_lock_irq(&curr->pi_lock); 486 /* 487 * We dropped the pi-lock, so re-check whether this 488 * task still owns the PI-state: 489 */ 490 if (head->next != next) { 491 spin_unlock(&hb->lock); 492 continue; 493 } 494 495 WARN_ON(pi_state->owner != curr); 496 WARN_ON(list_empty(&pi_state->list)); 497 list_del_init(&pi_state->list); 498 pi_state->owner = NULL; 499 spin_unlock_irq(&curr->pi_lock); 500 501 rt_mutex_unlock(&pi_state->pi_mutex); 502 503 spin_unlock(&hb->lock); 504 505 spin_lock_irq(&curr->pi_lock); 506 } 507 spin_unlock_irq(&curr->pi_lock); 508 } 509 510 static int 511 lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, 512 union futex_key *key, struct futex_pi_state **ps) 513 { 514 struct futex_pi_state *pi_state = NULL; 515 struct futex_q *this, *next; 516 struct plist_head *head; 517 struct task_struct *p; 518 pid_t pid = uval & FUTEX_TID_MASK; 519 520 head = &hb->chain; 521 522 plist_for_each_entry_safe(this, next, head, list) { 523 if (match_futex(&this->key, key)) { 524 /* 525 * Another waiter already exists - bump up 526 * the refcount and return its pi_state: 527 */ 528 pi_state = this->pi_state; 529 /* 530 * Userspace might have messed up non PI and PI futexes 531 */ 532 if (unlikely(!pi_state)) 533 return -EINVAL; 534 535 WARN_ON(!atomic_read(&pi_state->refcount)); 536 WARN_ON(pid && pi_state->owner && 537 pi_state->owner->pid != pid); 538 539 atomic_inc(&pi_state->refcount); 540 *ps = pi_state; 541 542 return 0; 543 } 544 } 545 546 /* 547 * We are the first waiter - try to look up the real owner and attach 548 * the new pi_state to it, but bail out when TID = 0 549 */ 550 if (!pid) 551 return -ESRCH; 552 p = futex_find_get_task(pid); 553 if (IS_ERR(p)) 554 return PTR_ERR(p); 555 556 /* 557 * We need to look at the task state flags to figure out, 558 * whether the task is exiting. To protect against the do_exit 559 * change of the task flags, we do this protected by 560 * p->pi_lock: 561 */ 562 spin_lock_irq(&p->pi_lock); 563 if (unlikely(p->flags & PF_EXITING)) { 564 /* 565 * The task is on the way out. When PF_EXITPIDONE is 566 * set, we know that the task has finished the 567 * cleanup: 568 */ 569 int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; 570 571 spin_unlock_irq(&p->pi_lock); 572 put_task_struct(p); 573 return ret; 574 } 575 576 pi_state = alloc_pi_state(); 577 578 /* 579 * Initialize the pi_mutex in locked state and make 'p' 580 * the owner of it: 581 */ 582 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); 583 584 /* Store the key for possible exit cleanups: */ 585 pi_state->key = *key; 586 587 WARN_ON(!list_empty(&pi_state->list)); 588 list_add(&pi_state->list, &p->pi_state_list); 589 pi_state->owner = p; 590 spin_unlock_irq(&p->pi_lock); 591 592 put_task_struct(p); 593 594 *ps = pi_state; 595 596 return 0; 597 } 598 599 /* 600 * The hash bucket lock must be held when this is called. 601 * Afterwards, the futex_q must not be accessed. 602 */ 603 static void wake_futex(struct futex_q *q) 604 { 605 plist_del(&q->list, &q->list.plist); 606 /* 607 * The lock in wake_up_all() is a crucial memory barrier after the 608 * plist_del() and also before assigning to q->lock_ptr. 609 */ 610 wake_up_all(&q->waiters); 611 /* 612 * The waiting task can free the futex_q as soon as this is written, 613 * without taking any locks. This must come last. 614 * 615 * A memory barrier is required here to prevent the following store 616 * to lock_ptr from getting ahead of the wakeup. Clearing the lock 617 * at the end of wake_up_all() does not prevent this store from 618 * moving. 619 */ 620 smp_wmb(); 621 q->lock_ptr = NULL; 622 } 623 624 static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) 625 { 626 struct task_struct *new_owner; 627 struct futex_pi_state *pi_state = this->pi_state; 628 u32 curval, newval; 629 630 if (!pi_state) 631 return -EINVAL; 632 633 spin_lock(&pi_state->pi_mutex.wait_lock); 634 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 635 636 /* 637 * This happens when we have stolen the lock and the original 638 * pending owner did not enqueue itself back on the rt_mutex. 639 * Thats not a tragedy. We know that way, that a lock waiter 640 * is on the fly. We make the futex_q waiter the pending owner. 641 */ 642 if (!new_owner) 643 new_owner = this->task; 644 645 /* 646 * We pass it to the next owner. (The WAITERS bit is always 647 * kept enabled while there is PI state around. We must also 648 * preserve the owner died bit.) 649 */ 650 if (!(uval & FUTEX_OWNER_DIED)) { 651 int ret = 0; 652 653 newval = FUTEX_WAITERS | task_pid_vnr(new_owner); 654 655 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 656 657 if (curval == -EFAULT) 658 ret = -EFAULT; 659 else if (curval != uval) 660 ret = -EINVAL; 661 if (ret) { 662 spin_unlock(&pi_state->pi_mutex.wait_lock); 663 return ret; 664 } 665 } 666 667 spin_lock_irq(&pi_state->owner->pi_lock); 668 WARN_ON(list_empty(&pi_state->list)); 669 list_del_init(&pi_state->list); 670 spin_unlock_irq(&pi_state->owner->pi_lock); 671 672 spin_lock_irq(&new_owner->pi_lock); 673 WARN_ON(!list_empty(&pi_state->list)); 674 list_add(&pi_state->list, &new_owner->pi_state_list); 675 pi_state->owner = new_owner; 676 spin_unlock_irq(&new_owner->pi_lock); 677 678 spin_unlock(&pi_state->pi_mutex.wait_lock); 679 rt_mutex_unlock(&pi_state->pi_mutex); 680 681 return 0; 682 } 683 684 static int unlock_futex_pi(u32 __user *uaddr, u32 uval) 685 { 686 u32 oldval; 687 688 /* 689 * There is no waiter, so we unlock the futex. The owner died 690 * bit has not to be preserved here. We are the owner: 691 */ 692 oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); 693 694 if (oldval == -EFAULT) 695 return oldval; 696 if (oldval != uval) 697 return -EAGAIN; 698 699 return 0; 700 } 701 702 /* 703 * Express the locking dependencies for lockdep: 704 */ 705 static inline void 706 double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) 707 { 708 if (hb1 <= hb2) { 709 spin_lock(&hb1->lock); 710 if (hb1 < hb2) 711 spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); 712 } else { /* hb1 > hb2 */ 713 spin_lock(&hb2->lock); 714 spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); 715 } 716 } 717 718 /* 719 * Wake up all waiters hashed on the physical page that is mapped 720 * to this virtual address: 721 */ 722 static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, 723 int nr_wake, u32 bitset) 724 { 725 struct futex_hash_bucket *hb; 726 struct futex_q *this, *next; 727 struct plist_head *head; 728 union futex_key key; 729 int ret; 730 731 if (!bitset) 732 return -EINVAL; 733 734 futex_lock_mm(fshared); 735 736 ret = get_futex_key(uaddr, fshared, &key); 737 if (unlikely(ret != 0)) 738 goto out; 739 740 hb = hash_futex(&key); 741 spin_lock(&hb->lock); 742 head = &hb->chain; 743 744 plist_for_each_entry_safe(this, next, head, list) { 745 if (match_futex (&this->key, &key)) { 746 if (this->pi_state) { 747 ret = -EINVAL; 748 break; 749 } 750 751 /* Check if one of the bits is set in both bitsets */ 752 if (!(this->bitset & bitset)) 753 continue; 754 755 wake_futex(this); 756 if (++ret >= nr_wake) 757 break; 758 } 759 } 760 761 spin_unlock(&hb->lock); 762 out: 763 futex_unlock_mm(fshared); 764 return ret; 765 } 766 767 /* 768 * Wake up all waiters hashed on the physical page that is mapped 769 * to this virtual address: 770 */ 771 static int 772 futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, 773 u32 __user *uaddr2, 774 int nr_wake, int nr_wake2, int op) 775 { 776 union futex_key key1, key2; 777 struct futex_hash_bucket *hb1, *hb2; 778 struct plist_head *head; 779 struct futex_q *this, *next; 780 int ret, op_ret, attempt = 0; 781 782 retryfull: 783 futex_lock_mm(fshared); 784 785 ret = get_futex_key(uaddr1, fshared, &key1); 786 if (unlikely(ret != 0)) 787 goto out; 788 ret = get_futex_key(uaddr2, fshared, &key2); 789 if (unlikely(ret != 0)) 790 goto out; 791 792 hb1 = hash_futex(&key1); 793 hb2 = hash_futex(&key2); 794 795 retry: 796 double_lock_hb(hb1, hb2); 797 798 op_ret = futex_atomic_op_inuser(op, uaddr2); 799 if (unlikely(op_ret < 0)) { 800 u32 dummy; 801 802 spin_unlock(&hb1->lock); 803 if (hb1 != hb2) 804 spin_unlock(&hb2->lock); 805 806 #ifndef CONFIG_MMU 807 /* 808 * we don't get EFAULT from MMU faults if we don't have an MMU, 809 * but we might get them from range checking 810 */ 811 ret = op_ret; 812 goto out; 813 #endif 814 815 if (unlikely(op_ret != -EFAULT)) { 816 ret = op_ret; 817 goto out; 818 } 819 820 /* 821 * futex_atomic_op_inuser needs to both read and write 822 * *(int __user *)uaddr2, but we can't modify it 823 * non-atomically. Therefore, if get_user below is not 824 * enough, we need to handle the fault ourselves, while 825 * still holding the mmap_sem. 826 */ 827 if (attempt++) { 828 ret = futex_handle_fault((unsigned long)uaddr2, 829 fshared, attempt); 830 if (ret) 831 goto out; 832 goto retry; 833 } 834 835 /* 836 * If we would have faulted, release mmap_sem, 837 * fault it in and start all over again. 838 */ 839 futex_unlock_mm(fshared); 840 841 ret = get_user(dummy, uaddr2); 842 if (ret) 843 return ret; 844 845 goto retryfull; 846 } 847 848 head = &hb1->chain; 849 850 plist_for_each_entry_safe(this, next, head, list) { 851 if (match_futex (&this->key, &key1)) { 852 wake_futex(this); 853 if (++ret >= nr_wake) 854 break; 855 } 856 } 857 858 if (op_ret > 0) { 859 head = &hb2->chain; 860 861 op_ret = 0; 862 plist_for_each_entry_safe(this, next, head, list) { 863 if (match_futex (&this->key, &key2)) { 864 wake_futex(this); 865 if (++op_ret >= nr_wake2) 866 break; 867 } 868 } 869 ret += op_ret; 870 } 871 872 spin_unlock(&hb1->lock); 873 if (hb1 != hb2) 874 spin_unlock(&hb2->lock); 875 out: 876 futex_unlock_mm(fshared); 877 878 return ret; 879 } 880 881 /* 882 * Requeue all waiters hashed on one physical page to another 883 * physical page. 884 */ 885 static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, 886 u32 __user *uaddr2, 887 int nr_wake, int nr_requeue, u32 *cmpval) 888 { 889 union futex_key key1, key2; 890 struct futex_hash_bucket *hb1, *hb2; 891 struct plist_head *head1; 892 struct futex_q *this, *next; 893 int ret, drop_count = 0; 894 895 retry: 896 futex_lock_mm(fshared); 897 898 ret = get_futex_key(uaddr1, fshared, &key1); 899 if (unlikely(ret != 0)) 900 goto out; 901 ret = get_futex_key(uaddr2, fshared, &key2); 902 if (unlikely(ret != 0)) 903 goto out; 904 905 hb1 = hash_futex(&key1); 906 hb2 = hash_futex(&key2); 907 908 double_lock_hb(hb1, hb2); 909 910 if (likely(cmpval != NULL)) { 911 u32 curval; 912 913 ret = get_futex_value_locked(&curval, uaddr1); 914 915 if (unlikely(ret)) { 916 spin_unlock(&hb1->lock); 917 if (hb1 != hb2) 918 spin_unlock(&hb2->lock); 919 920 /* 921 * If we would have faulted, release mmap_sem, fault 922 * it in and start all over again. 923 */ 924 futex_unlock_mm(fshared); 925 926 ret = get_user(curval, uaddr1); 927 928 if (!ret) 929 goto retry; 930 931 return ret; 932 } 933 if (curval != *cmpval) { 934 ret = -EAGAIN; 935 goto out_unlock; 936 } 937 } 938 939 head1 = &hb1->chain; 940 plist_for_each_entry_safe(this, next, head1, list) { 941 if (!match_futex (&this->key, &key1)) 942 continue; 943 if (++ret <= nr_wake) { 944 wake_futex(this); 945 } else { 946 /* 947 * If key1 and key2 hash to the same bucket, no need to 948 * requeue. 949 */ 950 if (likely(head1 != &hb2->chain)) { 951 plist_del(&this->list, &hb1->chain); 952 plist_add(&this->list, &hb2->chain); 953 this->lock_ptr = &hb2->lock; 954 #ifdef CONFIG_DEBUG_PI_LIST 955 this->list.plist.lock = &hb2->lock; 956 #endif 957 } 958 this->key = key2; 959 get_futex_key_refs(&key2); 960 drop_count++; 961 962 if (ret - nr_wake >= nr_requeue) 963 break; 964 } 965 } 966 967 out_unlock: 968 spin_unlock(&hb1->lock); 969 if (hb1 != hb2) 970 spin_unlock(&hb2->lock); 971 972 /* drop_futex_key_refs() must be called outside the spinlocks. */ 973 while (--drop_count >= 0) 974 drop_futex_key_refs(&key1); 975 976 out: 977 futex_unlock_mm(fshared); 978 return ret; 979 } 980 981 /* The key must be already stored in q->key. */ 982 static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) 983 { 984 struct futex_hash_bucket *hb; 985 986 init_waitqueue_head(&q->waiters); 987 988 get_futex_key_refs(&q->key); 989 hb = hash_futex(&q->key); 990 q->lock_ptr = &hb->lock; 991 992 spin_lock(&hb->lock); 993 return hb; 994 } 995 996 static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 997 { 998 int prio; 999 1000 /* 1001 * The priority used to register this element is 1002 * - either the real thread-priority for the real-time threads 1003 * (i.e. threads with a priority lower than MAX_RT_PRIO) 1004 * - or MAX_RT_PRIO for non-RT threads. 1005 * Thus, all RT-threads are woken first in priority order, and 1006 * the others are woken last, in FIFO order. 1007 */ 1008 prio = min(current->normal_prio, MAX_RT_PRIO); 1009 1010 plist_node_init(&q->list, prio); 1011 #ifdef CONFIG_DEBUG_PI_LIST 1012 q->list.plist.lock = &hb->lock; 1013 #endif 1014 plist_add(&q->list, &hb->chain); 1015 q->task = current; 1016 spin_unlock(&hb->lock); 1017 } 1018 1019 static inline void 1020 queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) 1021 { 1022 spin_unlock(&hb->lock); 1023 drop_futex_key_refs(&q->key); 1024 } 1025 1026 /* 1027 * queue_me and unqueue_me must be called as a pair, each 1028 * exactly once. They are called with the hashed spinlock held. 1029 */ 1030 1031 /* Return 1 if we were still queued (ie. 0 means we were woken) */ 1032 static int unqueue_me(struct futex_q *q) 1033 { 1034 spinlock_t *lock_ptr; 1035 int ret = 0; 1036 1037 /* In the common case we don't take the spinlock, which is nice. */ 1038 retry: 1039 lock_ptr = q->lock_ptr; 1040 barrier(); 1041 if (lock_ptr != NULL) { 1042 spin_lock(lock_ptr); 1043 /* 1044 * q->lock_ptr can change between reading it and 1045 * spin_lock(), causing us to take the wrong lock. This 1046 * corrects the race condition. 1047 * 1048 * Reasoning goes like this: if we have the wrong lock, 1049 * q->lock_ptr must have changed (maybe several times) 1050 * between reading it and the spin_lock(). It can 1051 * change again after the spin_lock() but only if it was 1052 * already changed before the spin_lock(). It cannot, 1053 * however, change back to the original value. Therefore 1054 * we can detect whether we acquired the correct lock. 1055 */ 1056 if (unlikely(lock_ptr != q->lock_ptr)) { 1057 spin_unlock(lock_ptr); 1058 goto retry; 1059 } 1060 WARN_ON(plist_node_empty(&q->list)); 1061 plist_del(&q->list, &q->list.plist); 1062 1063 BUG_ON(q->pi_state); 1064 1065 spin_unlock(lock_ptr); 1066 ret = 1; 1067 } 1068 1069 drop_futex_key_refs(&q->key); 1070 return ret; 1071 } 1072 1073 /* 1074 * PI futexes can not be requeued and must remove themself from the 1075 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry 1076 * and dropped here. 1077 */ 1078 static void unqueue_me_pi(struct futex_q *q) 1079 { 1080 WARN_ON(plist_node_empty(&q->list)); 1081 plist_del(&q->list, &q->list.plist); 1082 1083 BUG_ON(!q->pi_state); 1084 free_pi_state(q->pi_state); 1085 q->pi_state = NULL; 1086 1087 spin_unlock(q->lock_ptr); 1088 1089 drop_futex_key_refs(&q->key); 1090 } 1091 1092 /* 1093 * Fixup the pi_state owner with the new owner. 1094 * 1095 * Must be called with hash bucket lock held and mm->sem held for non 1096 * private futexes. 1097 */ 1098 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1099 struct task_struct *newowner, 1100 struct rw_semaphore *fshared) 1101 { 1102 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1103 struct futex_pi_state *pi_state = q->pi_state; 1104 struct task_struct *oldowner = pi_state->owner; 1105 u32 uval, curval, newval; 1106 int ret, attempt = 0; 1107 1108 /* Owner died? */ 1109 if (!pi_state->owner) 1110 newtid |= FUTEX_OWNER_DIED; 1111 1112 /* 1113 * We are here either because we stole the rtmutex from the 1114 * pending owner or we are the pending owner which failed to 1115 * get the rtmutex. We have to replace the pending owner TID 1116 * in the user space variable. This must be atomic as we have 1117 * to preserve the owner died bit here. 1118 * 1119 * Note: We write the user space value _before_ changing the 1120 * pi_state because we can fault here. Imagine swapped out