InnoDB 中的锁实现

InnoDB 中，所有事务加的行锁通过一个全局的 hash 表 lock_sys 维护：

/* The lock system */

lock_sys_t *lock_sys = NULL;

/** The lock system struct */

struct lock_sys_t {

  char pad1[INNOBASE_CACHE_LINE_SIZE];

  /*!< padding to prevent other

  memory update hotspots from

  residing on the same memory

  cache line */

  LockMutex mutex;              /*!< Mutex protecting the

                                locks */

  hash_table_t *rec_hash;       /*!< hash table of the record

                                locks */

  hash_table_t *prdt_hash;      /*!< hash table of the predicate

                                lock */

  hash_table_t *prdt_page_hash; /*!< hash table of the page

                                lock */

  char pad2[INNOBASE_CACHE_LINE_SIZE]; /*!< Padding */

  LockMutex wait_mutex;                /*!< Mutex protecting the

                                       next two fields */

  srv_slot_t *waiting_threads;         /*!< Array  of user threads

                                       suspended while waiting for

                                       locks within InnoDB, protected

                                       by the lock_sys->wait_mutex */

  srv_slot_t *last_slot;               /*!< highest slot ever used

                                       in the waiting_threads array,

                                       protected by

                                       lock_sys->wait_mutex */

  int n_waiting;                       /*!< Number of slots in use.

                                       Protected by lock_sys->mutex */

  ibool rollback_complete;

  /*!< TRUE if rollback of all

  recovered transactions is

  complete. Protected by

  lock_sys->mutex */

  ulint n_lock_max_wait_time; /*!< Max wait time */

  os_event_t timeout_event; /*!< Set to the event that is

                            created in the lock wait monitor

                            thread. A value of 0 means the

                            thread is not active */

  /** Marker value before trx_t::age. */

  uint64_t mark_age_updated;

#ifdef UNIV_DEBUG

  /** Lock timestamp counter */

  uint64_t m_seq;

#endif /* UNIV_DEBUG */

};

在进一步介绍行锁实现之前，先介绍一下InnoDB中事务与锁相关的数据结构

struct trx_t {

  enum isolation_level_t {

    /** dirty read: non-locking SELECTs are performed so that we

    do not look at a possible earlier version of a record; thus

    they are not 'consistent' reads under this isolation level;

    otherwise like level 2 */

    READ_UNCOMMITTED,

    /** somewhat Oracle-like isolation, except that in range UPDATE

    and DELETE we must block phantom rows with next-key locks;

    SELECT ... FOR UPDATE and ...  LOCK IN SHARE MODE only lock

    the index records, NOT the gaps before them, and thus allow

    free inserting; each consistent read reads its own snapshot */

    READ_COMMITTED,

    /** this is the default; all consistent reads in the same trx

    read the same snapshot; full next-key locking used in locking

    reads to block insertions into gaps */

    REPEATABLE_READ,

    /** all plain SELECTs are converted to LOCK IN SHARE MODE

    reads */

    SERIALIZABLE

  };

  TrxMutex mutex; /*!< Mutex protecting the fields

                  state and lock (except some fields

                  of lock, which are protected by

                  lock_sys->mutex) */

  bool owns_mutex; /*!< Set to the transaction that owns

                   the mutex during lock acquire and/or

                   release.

                   This is used to avoid taking the

                   trx_t::mutex recursively. */

  /* Note: in_depth was split from in_innodb for fixing a RO

  performance issue. Acquiring the trx_t::mutex for each row

  costs ~3% in performance. It is not required for correctness.

  Therefore we increment/decrement in_depth without holding any

  mutex. The assumption is that the Server will only ever call

  the handler from one thread. This is not true for kill_connection.

  Therefore in innobase_kill_connection. We don't increment this

  counter via TrxInInnoDB. */

  ib_uint32_t in_depth; /*!< Track nested TrxInInnoDB

                        count */

  ib_uint32_t in_innodb; /*!< if the thread is executing

                         in the InnoDB context count > 0. */

  bool abort; /*!< if this flag is set then

              this transaction must abort when

              it can */

  trx_id_t id; /*!< transaction id */

  trx_id_t no; /*!< transaction serialization number:

               max trx id shortly before the

               transaction is moved to

               COMMITTED_IN_MEMORY state.

               Protected by trx_sys_t::mutex

               when trx->in_rw_trx_list. Initially

               set to TRX_ID_MAX. */

  /** State of the trx from the point of view of concurrency control

  and the valid state transitions.

  Possible states:

  TRX_STATE_NOT_STARTED

  TRX_STATE_FORCED_ROLLBACK

  TRX_STATE_ACTIVE

  TRX_STATE_PREPARED

  TRX_STATE_COMMITTED_IN_MEMORY (alias below COMMITTED)

  Valid state transitions are:

  Regular transactions:

  * NOT_STARTED -> ACTIVE -> COMMITTED -> NOT_STARTED

  Auto-commit non-locking read-only:

  * NOT_STARTED -> ACTIVE -> NOT_STARTED

  XA (2PC):

  * NOT_STARTED -> ACTIVE -> PREPARED -> COMMITTED -> NOT_STARTED

  Recovered XA:

  * NOT_STARTED -> PREPARED -> COMMITTED -> (freed)

  XA (2PC) (shutdown or disconnect before ROLLBACK or COMMIT):

  * NOT_STARTED -> PREPARED -> (freed)

  Disconnected XA can become recovered:

  * ... -> ACTIVE -> PREPARED (connected) -> PREPARED (disconnected)

  Disconnected means from mysql e.g due to the mysql client disconnection.

  Latching and various transaction lists membership rules:

  XA (2PC) transactions are always treated as non-autocommit.

  Transitions to ACTIVE or NOT_STARTED occur when

  !in_rw_trx_list (no trx_sys->mutex needed).

  Autocommit non-locking read-only transactions move between states

  without holding any mutex. They are !in_rw_trx_list.

  All transactions, unless they are determined to be ac-nl-ro,

  explicitly tagged as read-only or read-write, will first be put

  on the read-only transaction list. Only when a !read-only transaction

  in the read-only list tries to acquire an X or IX lock on a table

  do we remove it from the read-only list and put it on the read-write

  list. During this switch we assign it a rollback segment.

  When a transaction is NOT_STARTED, it can be in_mysql_trx_list if

  it is a user transaction. It cannot be in rw_trx_list.

  ACTIVE->PREPARED->COMMITTED is only possible when trx->in_rw_trx_list.

  The transition ACTIVE->PREPARED is protected by trx_sys->mutex.

  ACTIVE->COMMITTED is possible when the transaction is in

  rw_trx_list.

  Transitions to COMMITTED are protected by both lock_sys->mutex

  and trx->mutex.

  NOTE: Some of these state change constraints are an overkill,

  currently only required for a consistent view for printing stats.

  This unnecessarily adds a huge cost for the general case. */

  trx_state_t state;

  /* If set, this transaction should stop inheriting (GAP)locks.

  Generally set to true during transaction prepare for RC or lower

  isolation, if requested. Needed for replication replay where

  we don't want to get blocked on GAP locks taken for protecting

  concurrent unique insert or replace operation. */

  bool skip_lock_inheritance;

  ReadView *read_view; /*!< consistent read view used in the

                       transaction, or NULL if not yet set */

  UT_LIST_NODE_T(trx_t)

  trx_list; /*!< list of transactions;

            protected by trx_sys->mutex. */

  UT_LIST_NODE_T(trx_t)

  no_list; /*!< Required during view creation

           to check for the view limit for

           transactions that are committing */

  trx_lock_t lock;   /*!< Information about the transaction

                     locks and state. Protected by

                     trx->mutex or lock_sys->mutex

                     or both */

  bool is_recovered; /*!< 0=normal transaction,

                     1=recovered, must be rolled back,

                     protected by trx_sys->mutex when

                     trx->in_rw_trx_list holds */

  hit_list_t hit_list; /*!< List of transactions to kill,

                       when a high priority transaction

                       is blocked on a lock wait. */

  os_thread_id_t killed_by; /*!< The thread ID that wants to

                            kill this transaction asynchronously.

                            This is required because we recursively

                            enter the handlerton methods and need

                            to distinguish between the kill thread

                            and the transaction thread.

                            Note: We need to be careful w.r.t the

                            Thread Pool. The thread doing the kill

                            should not leave InnoDB between the

                            mark and the actual async kill because

                            the running thread can change. */

  /* These fields are not protected by any mutex. */

  const char *op_info;   /*!< English text describing the

                         current operation, or an empty

                         string */

  ulint isolation_level; /*!< TRX_ISO_REPEATABLE_READ, ... */

  bool check_foreigns;   /*!< normally TRUE, but if the user

                         wants to suppress foreign key checks,

                         (in table imports, for example) we

                         set this FALSE */

  /*------------------------------*/

  /* MySQL has a transaction coordinator to coordinate two phase

  commit between multiple storage engines and the binary log. When

  an engine participates in a transaction, it's responsible for

  registering itself using the trans_register_ha() API. */

  bool is_registered; /* This flag is set to true after the

                      transaction has been registered with

                      the coordinator using the XA API, and

                      is set to false  after commit or

                      rollback. */

  /*------------------------------*/

  bool check_unique_secondary;

  /*!< normally TRUE, but if the user

  wants to speed up inserts by

  suppressing unique key checks

  for secondary indexes when we decide

  if we can use the insert buffer for

  them, we set this FALSE */

  bool flush_log_later;      /* In 2PC, we hold the

                             prepare_commit mutex across

                             both phases. In that case, we

                             defer flush of the logs to disk

                             until after we release the

                             mutex. */

  bool must_flush_log_later; /*!< this flag is set to TRUE in

                        trx_commit() if flush_log_later was

                        TRUE, and there were modifications by

                        the transaction; in that case we must

                        flush the log in

                        trx_commit_complete_for_mysql() */

  ulint duplicates;          /*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */

  bool has_search_latch;

  /*!< TRUE if this trx has latched the

  search system latch in S-mode.

  This now can only be true in

  row_search_mvcc, the btr search latch

  must has been released before exiting,

  and this flag would be set to false */

  trx_dict_op_t dict_operation; /**< @see enum trx_dict_op_t */

  bool ddl_operation;  /*!< True if this trx involves dd table

                        change */

  bool ddl_must_flush; /*!< True if this trx involves dd table

                       change, and must flush */

  bool in_truncate;    /* This trx is doing truncation */

  /* Fields protected by the srv_conc_mutex. */

  bool declared_to_be_inside_innodb;

  /*!< this is TRUE if we have declared

  this transaction in

  srv_conc_enter_innodb to be inside the

  InnoDB engine */

  ib_uint32_t n_tickets_to_enter_innodb;

  /*!< this can be > 0 only when

  declared_to_... is TRUE; when we come

  to srv_conc_innodb_enter, if the value

  here is > 0, we decrement this by 1 */

  ib_uint32_t dict_operation_lock_mode;

  /*!< 0, RW_S_LATCH, or RW_X_LATCH:

  the latch mode trx currently holds

  on dict_operation_lock. Protected

  by dict_operation_lock. */

  time_t start_time; /*!< time the state last time became

                     TRX_STATE_ACTIVE */

  /** Weight/Age of the transaction in the record lock wait queue. */

  int32_t age;

  /** For tracking if Weight/age has been updated. */

  uint64_t age_updated;

  lsn_t commit_lsn; /*!< lsn at the time of the commit */

  /*------------------------------*/

  THD *mysql_thd; /*!< MySQL thread handle corresponding

                  to this trx, or NULL */

  const char *mysql_log_file_name;

  /*!< if MySQL binlog is used, this field

  contains a pointer to the latest file

  name; this is NULL if binlog is not

  used */

  int64_t mysql_log_offset;

  /*!< if MySQL binlog is used, this

  field contains the end offset of the

  binlog entry */

  /*------------------------------*/

  ib_uint32_t n_mysql_tables_in_use; /*!< number of Innobase tables

                              used in the processing of the current

                              SQL statement in MySQL */

  ib_uint32_t mysql_n_tables_locked;

  /*!< how many tables the current SQL

  statement uses, except those

  in consistent read */

  /*------------------------------*/

#ifdef UNIV_DEBUG

  /** The following two fields are mutually exclusive. */

  /* @{ */

  bool in_rw_trx_list; /*!< true if in trx_sys->rw_trx_list */

                       /* @} */

#endif                 /* UNIV_DEBUG */

  UT_LIST_NODE_T(trx_t)

  mysql_trx_list; /*!< list of transactions created for

                  MySQL; protected by trx_sys->mutex */

#ifdef UNIV_DEBUG

  bool in_mysql_trx_list;

  /*!< true if in

  trx_sys->mysql_trx_list */

#endif /* UNIV_DEBUG */

  /*------------------------------*/

  dberr_t error_state;             /*!< 0 if no error, otherwise error

                                   number; NOTE That ONLY the thread

                                   doing the transaction is allowed to

                                   set this field: this is NOT protected

                                   by any mutex */

  const dict_index_t *error_index; /*!< if the error number indicates a

                                   duplicate key error, a pointer to

                                   the problematic index is stored here */

  ulint error_key_num;             /*!< if the index creation fails to a

                                   duplicate key error, a mysql key

                                   number of that index is stored here */

  sess_t *sess;                    /*!< session of the trx, NULL if none */

  que_t *graph;                    /*!< query currently run in the session,

                                   or NULL if none; NOTE that the query

                                   belongs to the session, and it can

                                   survive over a transaction commit, if

                                   it is a stored procedure with a COMMIT

                                   WORK statement, for instance */

  /*------------------------------*/

  UT_LIST_BASE_NODE_T(trx_named_savept_t)

  trx_savepoints; /*!< savepoints set with SAVEPOINT ...,

                  oldest first */

  /*------------------------------*/

  UndoMutex undo_mutex; /*!< mutex protecting the fields in this

                        section (down to undo_no_arr), EXCEPT

                        last_sql_stat_start, which can be

                        accessed only when we know that there

                        cannot be any activity in the undo

                        logs! */

  undo_no_t undo_no;    /*!< next undo log record number to

                        assign; since the undo log is

                        private for a transaction, this

                        is a simple ascending sequence

                        with no gaps; thus it represents

                        the number of modified/inserted

                        rows in a transaction */

  space_id_t undo_rseg_space;

  /*!< space id where last undo record

  was written */

  trx_savept_t last_sql_stat_start;

  /*!< undo_no when the last sql statement

  was started: in case of an error, trx

  is rolled back down to this undo

  number; see note at undo_mutex! */

  trx_rsegs_t rsegs;    /* rollback segments for undo logging */

  undo_no_t roll_limit; /*!< least undo number to undo during

                        a partial rollback; 0 otherwise */

#ifdef UNIV_DEBUG

  bool in_rollback;   /*!< true when the transaction is

                      executing a partial or full rollback */

#endif                /* UNIV_DEBUG */

  ulint pages_undone; /*!< number of undo log pages undone

                      since the last undo log truncation */

  /*------------------------------*/

  ulint n_autoinc_rows;       /*!< no. of AUTO-INC rows required for

                              an SQL statement. This is useful for

                              multi-row INSERTs */

  ib_vector_t *autoinc_locks; /* AUTOINC locks held by this

                              transaction. Note that these are

                              also in the lock list trx_locks. This

                              vector needs to be freed explicitly

                              when the trx instance is destroyed.

                              Protected by lock_sys->mutex. */

  /*------------------------------*/

  bool read_only;        /*!< true if transaction is flagged

                         as a READ-ONLY transaction.

                         if auto_commit && will_lock == 0

                         then it will be handled as a

                         AC-NL-RO-SELECT (Auto Commit Non-Locking

                         Read Only Select). A read only

                         transaction will not be assigned an

                         UNDO log. */

  bool auto_commit;      /*!< true if it is an autocommit */

  ib_uint32_t will_lock; /*!< Will acquire some locks. Increment

                         each time we determine that a lock will

                         be acquired by the MySQL layer. */

#ifndef UNIV_HOTBACKUP

  /*------------------------------*/

  fts_trx_t *fts_trx;       /*!< FTS information, or NULL if

                            transaction hasn't modified tables

                            with FTS indexes (yet). */

  doc_id_t fts_next_doc_id; /* The document id used for updates */

  /*------------------------------*/

  ib_uint32_t flush_tables; /*!< if "covering" the FLUSH TABLES",

                            count of tables being flushed. */

  /*------------------------------*/

  bool internal; /*!< true if it is a system/internal

                 transaction background task. Such

                 transactions are always treated as

                 read-write. */

                 /*------------------------------*/

#ifdef UNIV_DEBUG

  ulint start_line;       /*!< Track where it was started from */

  const char *start_file; /*!< Filename where it was started */

#endif                    /* UNIV_DEBUG */

  lint n_ref; /*!< Count of references, protected

              by trx_t::mutex. We can't release the

              locks nor commit the transaction until

              this reference is 0.  We can change

              the state to COMMITTED_IN_MEMORY to

              signify that it is no longer

              "active". */

  /** Version of this instance. It is incremented each time the

  instance is re-used in trx_start_low(). It is used to track

  whether a transaction has been restarted since it was tagged

  for asynchronous rollback. */

  ulint version;

  XID *xid;                    /*!< X/Open XA transaction

                               identification to identify a

                               transaction branch */

  trx_mod_tables_t mod_tables; /*!< List of tables that were modified

                               by this transaction */

#endif                         /* !UNIV_HOTBACKUP */

                               /*------------------------------*/

  bool api_trx;                /*!< trx started by InnoDB API */

  bool api_auto_commit;        /*!< automatic commit */

  bool read_write;             /*!< if read and write operation */

  /*------------------------------*/

  char *detailed_error;          /*!< detailed error message for last

                                 error, or empty. */

  FlushObserver *flush_observer; /*!< flush observer */

#ifdef UNIV_DEBUG

  bool is_dd_trx; /*!< True if the transaction is used for

                  doing Non-locking Read-only Read

                  Committed on DD tables */

#endif            /* UNIV_DEBUG */

  ulint magic_n;

  bool is_read_uncommitted() const {

    return (isolation_level == READ_UNCOMMITTED);

  }

  bool skip_gap_locks() const {

    switch (isolation_level) {

      case READ_UNCOMMITTED:

      case READ_COMMITTED:

        return (true);

      case REPEATABLE_READ:

      case SERIALIZABLE:

        return (false);

    }

    ut_ad(0);

    return (false);

  }

  bool allow_semi_consistent() const { return (skip_gap_locks()); }

};

上面太长不看,简化如下：

struct trx_t {

	...

	trx_lock_t lock;   /*!< Information about the transaction

                     locks and state. Protected by

                     trx->mutex or lock_sys->mutex

                     or both */

	...

}

在 8.0 中，将 lock 单独从 trx_t 中解耦出来。观察 trx_lock_t 的定义。

/** Latching protocol for trx_lock_t::que_state.  trx_lock_t::que_state

 captures the state of the query thread during the execution of a query.

 This is different from a transaction state. The query state of a transaction

 can be updated asynchronously by other threads.  The other threads can be

 system threads, like the timeout monitor thread or user threads executing

 other queries. Another thing to be mindful of is that there is a delay between

 when a query thread is put into LOCK_WAIT state and before it actually starts

 waiting.  Between these two events it is possible that the query thread is

 granted the lock it was waiting for, which implies that the state can be

 changed asynchronously.

 All these operations take place within the context of locking. Therefore state

 changes within the locking code must acquire both the lock mutex and the

 trx->mutex when changing trx->lock.que_state to TRX_QUE_LOCK_WAIT or

 trx->lock.wait_lock to non-NULL but when the lock wait ends it is sufficient

 to only acquire the trx->mutex.

 To query the state either of the mutexes is sufficient within the locking

 code and no mutex is required when the query thread is no longer waiting. */

/** The locks and state of an active transaction. Protected by

lock_sys->mutex, trx->mutex or both. */

struct trx_lock_t {

  ulint n_active_thrs; /*!< number of active query threads */

  trx_que_t que_state; /*!< valid when trx->state

                       == TRX_STATE_ACTIVE: TRX_QUE_RUNNING,

                       TRX_QUE_LOCK_WAIT, ... */

  lock_t *wait_lock;         /*!< if trx execution state is

                             TRX_QUE_LOCK_WAIT, this points to

                             the lock request, otherwise this is

                             NULL; set to non-NULL when holding

                             both trx->mutex and lock_sys->mutex;

                             set to NULL when holding

                             lock_sys->mutex; readers should

                             hold lock_sys->mutex, except when

                             they are holding trx->mutex and

                             wait_lock==NULL */

  ib_uint64_t deadlock_mark; /*!< A mark field that is initialized

                             to and checked against lock_mark_counter

                             by lock_deadlock_recursive(). */

  bool was_chosen_as_deadlock_victim;

  /*!< when the transaction decides to

  wait for a lock, it sets this to false;

  if another transaction chooses this

  transaction as a victim in deadlock

  resolution, it sets this to true.

  Protected by trx->mutex. */

  time_t wait_started; /*!< lock wait started at this time,

                       protected only by lock_sys->mutex */

  que_thr_t *wait_thr; /*!< query thread belonging to this

                       trx that is in QUE_THR_LOCK_WAIT

                       state. For threads suspended in a

                       lock wait, this is protected by

                       lock_sys->mutex. Otherwise, this may

                       only be modified by the thread that is

                       serving the running transaction. */

  lock_pool_t rec_pool; /*!< Pre-allocated record locks */

  lock_pool_t table_pool; /*!< Pre-allocated table locks */

  ulint rec_cached; /*!< Next free rec lock in pool */

  ulint table_cached; /*!< Next free table lock in pool */

  mem_heap_t *lock_heap; /*!< memory heap for trx_locks;

                         protected by lock_sys->mutex */

  trx_lock_list_t trx_locks; /*!< locks requested by the transaction;

                             insertions are protected by trx->mutex

                             and lock_sys->mutex; removals are

                             protected by lock_sys->mutex */

  lock_pool_t table_locks; /*!< All table locks requested by this

                           transaction, including AUTOINC locks */

  ulint n_rec_locks; /*!< number of rec locks in this trx */

#ifdef UNIV_DEBUG

  /** When a transaction is forced to rollback due to a deadlock

  check or by another high priority transaction this is true. Used

  by debug checks in lock0lock.cc */

  bool in_rollback;

#endif /* UNIV_DEBUG */

  /** The transaction called ha_innobase::start_stmt() to

  lock a table. Most likely a temporary table. */

  bool start_stmt;

};

typedef UT_LIST_BASE_NODE_T(lock_t) trx_lock_list_t;

trx_lock_list_t trx_locks; /*!< locks requested by the transaction;

                             insertions are protected by trx->mutex

                             and lock_sys->mutex; removals are

                             protected by lock_sys->mutex */

每个事务都会维护执行过程中加的所有锁（ trx_locks，双向链表），以及发生锁等待时等待的锁（ wait_lock ）

lock_sys 中保存的对象为 lock_t ，它可以描述一个行锁或者表锁，如果是行锁，会被 lock_sys 维护，若是表锁，会被对应的 dict_table_t 维护

/** Lock struct; protected by lock_sys->mutex */

struct lock_t {

  /** transaction owning the lock */

  trx_t *trx;

  /** list of the locks of the transaction */

  UT_LIST_NODE_T(lock_t) trx_locks;

  /** Index for a record lock */

  dict_index_t *index;

  /** Hash chain node for a record lock. The link node in a singly

  linked list, used by the hash table. */

  lock_t *hash;

  union {

    /** Table lock */

    lock_table_t tab_lock;

    /** Record lock */

    lock_rec_t rec_lock;

  };

#ifdef HAVE_PSI_THREAD_INTERFACE

#ifdef HAVE_PSI_DATA_LOCK_INTERFACE

  /** Performance schema thread that created the lock. */

  ulonglong m_psi_internal_thread_id;

  /** Performance schema event that created the lock. */

  ulonglong m_psi_event_id;

#endif /* HAVE_PSI_DATA_LOCK_INTERFACE */

#endif /* HAVE_PSI_THREAD_INTERFACE */

  /** The lock type and mode bit flags.

  LOCK_GAP or LOCK_REC_NOT_GAP, LOCK_INSERT_INTENTION, wait flag, ORed */

  uint32_t type_mode;

#if defined(UNIV_DEBUG)

  /** Timestamp when it was created. */

  uint64_t m_seq;

#endif /* UNIV_DEBUG */

  /** Remove GAP lock from a next Key Lock */

  void remove_gap_lock() {

    ut_ad(!is_gap());

    ut_ad(!is_insert_intention());

    ut_ad(is_record_lock());

    type_mode |= LOCK_REC_NOT_GAP;

  }

  /** Determine if the lock object is a record lock.

  @return true if record lock, false otherwise. */

  bool is_record_lock() const { return (type() == LOCK_REC); }

  /** Determine if it is predicate lock.

  @return true if predicate lock, false otherwise. */

  bool is_predicate() const {

    return (type_mode & (LOCK_PREDICATE | LOCK_PRDT_PAGE));

  }

  /** @return true if the lock wait flag is set */

  bool is_waiting() const { return (type_mode & LOCK_WAIT); }

  /** @return true if the gap lock bit is set */

  bool is_gap() const { return (type_mode & LOCK_GAP); }

  /** @return true if the not gap lock bit is set */

  bool is_record_not_gap() const { return (type_mode & LOCK_REC_NOT_GAP); }

  /** @return true if the insert intention bit is set */

  bool is_insert_intention() const {

    return (type_mode & LOCK_INSERT_INTENTION);

  }

  /** @return the lock mode */

  uint32_t type() const { return (type_mode & LOCK_TYPE_MASK); }

  /** @return the precise lock mode */

  lock_mode mode() const {

    return (static_cast<lock_mode>(type_mode & LOCK_MODE_MASK));

  }

  /** Get lock hash table

  @return lock hash table */

  hash_table_t *hash_table() const { return (lock_hash_get(type_mode)); }

  /** @return the record lock tablespace ID */

  space_id_t space_id() const {

    ut_ad(is_record_lock());

    return (rec_lock.space);

  }

  /** @return the record lock page number */

  page_no_t page_no() const {

    ut_ad(is_record_lock());

    return (rec_lock.page_no);

  }

  /** @return the transaction's query thread state. */

  trx_que_t trx_que_state() const { return (trx->lock.que_state); }

  /** Print the lock object into the given output stream.

  @param[in,out]	out	the output stream

  @return the given output stream. */

  std::ostream &print(std::ostream &out) const;

  /** Convert the member 'type_mode' into a human readable string.

  @return human readable string */

  std::string type_mode_string() const;

  /* @return the string/text representation of the record type. */

  const char *type_string() const {

    switch (type_mode & LOCK_TYPE_MASK) {

      case LOCK_REC:

        return ("LOCK_REC");

      case LOCK_TABLE:

        return ("LOCK_TABLE");

      default:

        ut_error;

    }

  }

};

不论行锁还是表锁，都记录了所属的事务，锁创建后会通过 trx_locks（next和prev指针）添加到所属事务已获得锁链表中（ trx_t.lock.trx_locks ），如果创建的锁为行锁，会使用（space_no , page_no ）经过hash函数计算后添加到 lock_sys 中。如果创建的锁为表锁，将其加入到对应dict_table_t的表锁链表尾部（lock_table_create）。

行锁与表锁

InnoDB 中一个事务中的行锁是按照 page 和锁类型组织，相同的锁类型，锁一行和锁这行所在page中所有行的存储代价是一样的，lock_rec_t 中的成员 nbits 后面有一个 bitmap ，bitmap 占用的存储空间为：(1 + (nbits-1)/8) bytes，即：lock_rec_t 占用的实际存储空间为：sizeof(lock_rec_t) + 1 + (nbits-1)/8，bitmap 中的每一位标识对应的行是否加锁

/** Record lock for a page */

struct lock_rec_t {

  space_id_t space;  /*!< space id */

  page_no_t page_no; /*!< page number */

  uint32_t n_bits;   /*!< number of bits in the lock

                     bitmap; NOTE: the lock bitmap is

                     placed immediately after the

                     lock struct */

  /** Print the record lock into the given output stream

  @param[in,out]	out	the output stream

  @return the given output stream. */

  std::ostream &print(std::ostream &out) const;

};

lock_table_t 表锁，保存了对应的 table 和 table上表锁链接 locks（prev和next指针）

/** A table lock */

struct lock_table_t {

  dict_table_t *table; /*!< database table in dictionary

                       cache */

  UT_LIST_NODE_T(lock_t)

  locks; /*!< list of locks on the same

         table */

  /** Print the table lock into the given output stream

  @param[in,out]	out	the output stream

  @return the given output stream. */

  std::ostream &print(std::ostream &out) const;

};

lock_t 表示一个锁，表锁或者行锁，其中的 trx_locks 成员会被链接到 trx_t 中的已加锁链表 trx_t.lock.trx_locks 中，而 lock_table_t 中的 locks 成员是需要链接到对应 table 后面，作用不同但容易混淆；lock_t中的 hash 和 index 成员放到 lock_rec_t 结构体中更加合适，可能是基于 lock_t 是行锁的可能性更大的前提考虑的

Wait-For-Graph在InnoDB中的实现

InnoDB 在实现 Wait-For-Graph 时基于性能方面的考虑，定义了两个变量：

LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK (默认200)：深度优先遍历的层数超过此值，即认为发生死锁，回滚当前事务

LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK(默认1000000)：lock_deadlock_recursive递归调用的次数超过此值，即认为发生死锁，回滚当前事务

笔者测试的版本中，死锁检测单独为一个类 DeadlockChecker

class DeadlockChecker {

 public:

  /** Checks if a joining lock request results in a deadlock. If

  a deadlock is found this function will resolve the deadlock

  by choosing a victim transaction and rolling it back. It

  will attempt to resolve all deadlocks. The returned transaction

  id will be the joining transaction id or 0 if some other

  transaction was chosen as a victim and rolled back or no

  deadlock found.

  @param lock lock the transaction is requesting

  @param trx transaction requesting the lock

  @return id of transaction chosen as victim or 0 */

  static const trx_t *	(const lock_t *lock, trx_t *trx);

 private:

  /** Do a shallow copy. Default destructor OK.

  @param trx the start transaction (start node)

  @param wait_lock lock that a transaction wants

  @param mark_start visited node counter */

  DeadlockChecker(const trx_t *trx, const lock_t *wait_lock,

                  uint64_t mark_start)

      : m_cost(),

        m_start(trx),

        m_too_deep(),

        m_wait_lock(wait_lock),

        m_mark_start(mark_start),

        m_n_elems() {}

  /** Check if the search is too deep. */

  bool is_too_deep() const {

    return (m_n_elems > LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK ||

            m_cost > LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK);

  }

  /** Save current state.

  @param lock lock to push on the stack.

  @param heap_no the heap number to push on the stack.

  @return false if stack is full. */

  bool push(const lock_t *lock, ulint heap_no) {

    ut_ad((lock_get_type_low(lock) & LOCK_REC) ||

          (lock_get_type_low(lock) & LOCK_TABLE));

    ut_ad(((lock_get_type_low(lock) & LOCK_TABLE) != 0) ==

          (heap_no == ULINT_UNDEFINED));

    /* Ensure that the stack is bounded. */

    if (m_n_elems >= UT_ARR_SIZE(s_states)) {

      return (false);

    }

    state_t &state = s_states[m_n_elems++];

    state.m_lock = lock;

    state.m_wait_lock = m_wait_lock;

    state.m_heap_no = heap_no;

    return (true);

  }

  /** Restore state.

  @param[out] lock current lock

  @param[out] heap_no current heap_no */

  void pop(const lock_t *&lock, ulint &heap_no) {

    ut_a(m_n_elems > 0);

    const state_t &state = s_states[--m_n_elems];

    lock = state.m_lock;

    heap_no = state.m_heap_no;

    m_wait_lock = state.m_wait_lock;

  }

  /** Check whether the node has been visited.

  @param lock lock to check

  @return true if the node has been visited */

  bool is_visited(const lock_t *lock) const {

    return (lock->trx->lock.deadlock_mark > m_mark_start);

  }

  /** Get the next lock in the queue that is owned by a transaction

  whose sub-tree has not already been searched.

  @param lock Lock in queue

  @param heap_no heap_no if lock is a record lock else ULINT_UNDEFINED

  @return next lock or NULL if at end of queue */

  const lock_t *get_next_lock(const lock_t *lock, ulint heap_no) const;

  /** Get the first lock to search. The search starts from the current

  wait_lock. What we are really interested in is an edge from the

  current wait_lock's owning transaction to another transaction that has

  a lock ahead in the queue. We skip locks where the owning transaction's

  sub-tree has already been searched.

  For record locks, we first position the iterator on first lock on

  the page and then reposition on the actual heap_no. This is required

  due to the way the record lock has is implemented.

  @param[out] heap_no if rec lock, else ULINT_UNDEFINED.

  @return first lock or NULL */

  const lock_t *get_first_lock(ulint *heap_no) const;

  /** Notify that a deadlock has been detected and print the conflicting

  transaction info.

  @param lock lock causing deadlock */

  void notify(const lock_t *lock) const;

  /** Select the victim transaction that should be rolledback.

  @return victim transaction */

  const trx_t *select_victim() const;

  /** Rollback transaction selected as the victim. */

  void trx_rollback();

  /** Looks iteratively for a deadlock. Note: the joining transaction

  may have been granted its lock by the deadlock checks.

  @return 0 if no deadlock else the victim transaction.*/

  const trx_t *search();

#ifdef UNIV_DEBUG

  /** Determines if a situation in which the lock takes part in a deadlock

  cycle is expected (as in: handled correctly) or not (say because it is on a DD

  table, for which there is no reason to expect a deadlock and we don't handle

  deadlocks correctly). The purpose of the function is to use it in an assertion

  failing as soon as the deadlock is identified, to give developer a chance to

  investigate the root cause of the situation (without such assertion, the code

  might continue to run and either fail at later stage when the data useful for

  debugging is no longer on stack, or not fail at all, which is risky).

  @param[in] lock lock found in a deadlock cycle

  @return true if we expect that this lock can take part in a deadlock cycle */

  static bool is_allowed_to_be_on_cycle(const lock_t *lock);

#endif /* UNIV_DEBUG */

  /** Print transaction data to the deadlock file and possibly to stderr.

  @param trx transaction

  @param max_query_len max query length to print */

  static void print(const trx_t *trx, ulint max_query_len);

  /** rewind(3) the file used for storing the latest detected deadlock

  and print a heading message to stderr if printing of all deadlocks to

  stderr is enabled. */

  static void start_print();

  /** Print lock data to the deadlock file and possibly to stderr.

  @param lock record or table type lock */

  static void print(const lock_t *lock);

  /** Print a message to the deadlock file and possibly to stderr.

  @param msg message to print */

  static void print(const char *msg);

  /** Print info about transaction that was rolled back.

  @param trx transaction rolled back

  @param lock lock trx wants */

  static void rollback_print(const trx_t *trx, const lock_t *lock);

 private:

  /** DFS state information, used during deadlock checking. */

  struct state_t {

    const lock_t *m_lock;      /*!< Current lock */

    const lock_t *m_wait_lock; /*!< Waiting for lock */

    ulint m_heap_no;           /*!< heap number if rec lock */

  };

  /** Used in deadlock tracking. Protected by lock_sys->mutex. */

  static uint64_t s_lock_mark_counter;

  /** Calculation steps thus far. It is the count of the nodes visited. */

  ulint m_cost;

  /** Joining transaction that is requesting a lock in an

  incompatible mode */

  const trx_t *m_start;

  /** true if search was too deep and was aborted */

  bool m_too_deep;

  /** Lock that trx wants */

  const lock_t *m_wait_lock;

  /**  Value of lock_mark_count at the start of the deadlock check. */

  uint64_t m_mark_start;

  /** Number of states pushed onto the stack */

  size_t m_n_elems;

  /** This is to avoid malloc/free calls. */

  static state_t s_states[MAX_STACK_SIZE];

};

/** Looks iteratively for a deadlock. Note: the joining transaction may

have been granted its lock by the deadlock checks.

@return 0 if no deadlock else the victim transaction instance.*/

const trx_t *DeadlockChecker::search() {

  ut_ad(lock_mutex_own());

  ut_ad(!trx_mutex_own(m_start));

  ut_ad(m_start != NULL);

  ut_ad(m_wait_lock != NULL);

  check_trx_state(m_wait_lock->trx);

  ut_ad(m_mark_start <= s_lock_mark_counter);

  /* Look at the locks ahead of wait_lock in the lock queue. */

  ulint heap_no;

  const lock_t *lock = get_first_lock(&heap_no);

  for (;;) {

    /* We should never visit the same sub-tree more than once. */

    ut_ad(lock == NULL || !is_visited(lock));

    while (m_n_elems > 0 && lock == NULL) {

      /* Restore previous search state. */

      pop(lock, heap_no);

      lock = get_next_lock(lock, heap_no);

    }

    if (lock == NULL) {

      break;

    } else if (lock == m_wait_lock) {

      /* We can mark this subtree as searched */

      ut_ad(lock->trx->lock.deadlock_mark <= m_mark_start);

      lock->trx->lock.deadlock_mark = ++s_lock_mark_counter;

      /* We are not prepared for an overflow. This 64-bit

      counter should never wrap around. At 10^9 increments

      per second, it would take 10^3 years of uptime. */

      ut_ad(s_lock_mark_counter > 0);

      /* Backtrack */

      lock = NULL;

    } else if (!lock_has_to_wait(m_wait_lock, lock)) {

      /* No conflict, next lock */

      lock = get_next_lock(lock, heap_no);

    } else if (lock->trx == m_start) {

      /* Found a cycle. */

      notify(lock);

      /* We don't expect deadlocks with most DD tables and all SDI tables.

      If we find, we crash early to find the transactions causing deadlock */

      ut_ad(is_allowed_to_be_on_cycle(lock));

      ut_ad(is_allowed_to_be_on_cycle(m_wait_lock));

      return (select_victim());

    } else if (is_too_deep()) {

      /* Search too deep to continue. */

      m_too_deep = true;

      return (m_start);

    } else if (lock->trx_que_state() == TRX_QUE_LOCK_WAIT) {

      /* Another trx ahead has requested a lock in an

      incompatible mode, and is itself waiting for a lock. */

      ++m_cost;

      if (!push(lock, heap_no)) {

        m_too_deep = true;

        return (m_start);

      }

      m_wait_lock = lock->trx->lock.wait_lock;

      lock = get_first_lock(&heap_no);

      if (is_visited(lock)) {

        lock = get_next_lock(lock, heap_no);

      }

    } else {

      lock = get_next_lock(lock, heap_no);

    }

  }

  ut_a(lock == NULL && m_n_elems == 0);

  /* No deadlock found. */

  return (0);

}

InnoDB通常在哪些情况下可能发生死锁

回顾一下cs课本上关于发生死锁的几个条件：

资源互斥；
请求保持；
不剥夺；
循环等待

对于DB而言，导致死锁意味着发生了循环等待，在InnoDB中由于行锁的引入，比较容易发生死锁，下面总结一些发生死锁的情况（不全）：

同一索引上，两个session相反的顺序加锁多行记录
Primary key和Secondary index，通过primary key找到记录，更新Secondary index字段与通过Secondary index更新记录
UPDATE/DELETE通过不同的二级索引更新多条记录，可能造成在Primary key上不同的加锁顺序