基本数据结构的 Python 实现及应用

一、内置数据结构的性能

List

Python 内置的 List 类型针对不同操作的性能如下：

Operation	Big-O
index	O(1)
index 赋值	O(1)
append	O(1)
pop()	O(1)
pop(i)	O(n)
insert(i, item)	O(n)
del	O(n)
iteration	O(n)
contains (in)	O(n)
slice [x:y]	O(k)
del slice	O(n)
set slice	O(n+k)
reverse	O(n)
concatenate	O(k)
sort	O(n log n)
multiply	O(nk)

几种列表初始化方式的运行时间对比（ concatenate > append > comprehension > list range）：

from timeit import Timer

# concatenate
def test1():
    l = []
    for i in range(1000):
        l = l + [i]

# append
def test2():
    l = []
    for i in range(1000):
        l.append(i)

# comprehension
def test3():
    l = [ i for i in range(1000)]

# list range
def test4():
    l = list(range(1000))


t1 = Timer("test1()", "from __main__ import test1")
print(f"concat {t1.timeit(number=1000)} milliseconds")

t2 = Timer("test2()", "from __main__ import test2")
print(f"append {t2.timeit(number=1000)} milliseconds")

t3 = Timer("test3()", "from __main__ import test3")
print(f"comprehension {t3.timeit(number=1000)} milliseconds")

t4 = Timer("test4()", "from __main__ import test4")
print(f"list range {t4.timeit(number=1000)} milliseconds")

# concat 1.3573799000000002 milliseconds
# append 0.0650925 milliseconds
# comprehension 0.03262469999999995 milliseconds
# list range 0.01332690000000003 milliseconds

列表的 pop(0) 方法和 pop() 方法的运行时间对比（pop(0) 为 O(n)，pop() 为 O(1)）：

from timeit import Timer
from matplotlib import pyplot as plt

pop_zero = Timer("x.pop(0)", "from __main__ import x")
pop_end = Timer("x.pop()", "from __main__ import x")
X, Ypt, Ypz = [], [], []

for i in range(10000, 500001, 10000):
    x = list(range(i))
    pt = pop_end.timeit(number=1000)
    pz = pop_zero.timeit(number=1000)

    X.append(i)
    Ypt.append(pt)
    Ypz.append(pz)


plt.scatter(X, Ypt, c='m', marker='^', label='pop_end')
plt.scatter(X, Ypz, c='y', label='pop_zero')
plt.legend()
plt.show()

pop(0) vs pop()

Dictionary

Operation	Big-O
copy	O(n)
get item	O(1)
set item	O(1)
delete item	O(1)
contains (in)	O(1)
iteration	O(n)

列表与字典 contains 操作的运行时间对比（列表为 O(n)，字典为 O(1)）：

import timeit
import random
from matplotlib import pyplot as plt


X, Ylist, Ydict = [], [], []
for i in range(10000, 1000001, 20000):
    t = timeit.Timer(f"random.randrange({i}) in x",
            "from __main__ import random, x")

    x = list(range(i))
    list_time = t.timeit(number=1000)

    x = {j: None for j in range(i)}
    dict_time = t.timeit(number=1000)

    X.append(i)
    Ylist.append(list_time)
    Ydict.append(dict_time)

plt.scatter(X, Ylist, c='m', marker='^', label='list')
plt.scatter(X, Ydict, c='y', label='dict')
plt.legend()
plt.show()

list vs dict

二、Stack

LIFO（last-in first-out），从顶部添加新项目，移除项目也从顶部开始。即越是最新添加到栈中的项目越是接近栈的顶部位置，同时也最先出栈。
类似于在厨房里刷盘子，洗好的盘子摞在顶部，使用时也最先拿顶部（最近洗好）的干净盘子。

# stack.py
class Stack:
    def __init__(self):
        self.items = []

    def is_empty(self):
        return self.items == []

    def push(self, item):
        self.items.append(item)

    def pop(self):
        return self.items.pop()

    def peek(self):
        return self.items[len(self.items) - 1]

    def size(self):
        return len(self.items)


if __name__ == '__main__':
    s = Stack()

    print(s.is_empty())  # True
    s.push(4)
    s.push('dog')
    print(s.peek())  # dog
    s.push(True)
    print(s.size())  # 3
    s.push(8.4)
    print(s.pop())  # 8.4
    print(s.pop())  # True
    print(s.size())  # 2

进制转换

通过短除法将十进制数转换为其他进制：

from stack import Stack

def base_converter(dec_number, base):
    digits = "0123456789ABCDEF"

    rem_stack = Stack()

    while dec_number > 0:
        rem = dec_number % base
        rem_stack.push(rem)
        dec_number = dec_number // base

    new_string = ""
    while not rem_stack.is_empty():
        new_string = new_string + digits[rem_stack.pop()]

    return new_string

print(base_converter(28, 2))  # 11100
print(base_converter(28, 16))  # 1C

短除法每次循环生成的余数从后往前拼接得到最终的目标进制数字。即最先生成的余数为目标进制数字的最后一位，最后生成的余数为目标进制数字的第一位，后进先出。
to binary

括号匹配

from stack import Stack

def par_checker(symbol_string):
    s = Stack()
    balanced = True
    index = 0
    while index < len(symbol_string) and balanced:
        symbol = symbol_string[index]
        if symbol in "([{":
            s.push(symbol)
        else:
            if s.is_empty():
                balanced = False
            else:
                top = s.pop()
                if not matches(top, symbol):
                    balanced = False
        index += 1
    if balanced and s.is_empty():
        return True
    else:
        return False

def matches(open, close):
    opens = "([{"
    closes = ")]}"
    return opens.index(open) == closes.index(close)


print(par_checker('{{([][])}()}'))  # True
print(par_checker('[{()]'))  # False

遇到左半边括号（([{）就 push 到 Stack 中，遇到右半边括号（)]}）就从 Stack 中 pop 出一个左半边括号进行配对。
最先遇到的右半边括号肯定需要与最后放入 Stack 中的左半边括号匹配（因此使用 Stack 的 pop 取数据）。最终 Stack 为空，则括号全部匹配。
par_checker

三、Queues & Deques

FIFO（first-in first-out），从队列一端（rear）添加新项目，从另一端（front）弹出项目。
类似于生活中的排队点餐，新来的顾客排到队伍最后面，队伍最前面的人优先点餐和付款然后离开队伍。
deque 为双向队列。

# queues.py
class Queue:
    def __init__(self):
        self.items = []

    def is_empty(self):
        return self.items == []

    def enqueue(self, item):
        self.items.insert(0, item)

    def dequeue(self):
        return self.items.pop()

    def size(self):
        return len(self.items)


class Deque:
    def __init__(self):
        self.items = []

    def is_empty(self):
        return self.items == []

    def add_front(self, item):
        self.items.append(item)

    def add_rear(self, item):
        self.items.insert(0, item)

    def remove_front(self):
        return self.items.pop()

    def remove_rear(self):
        return self.items.pop(0)

    def size(self):
        return len(self.items)

Palindrome Checker

Palindrome 是指左右对称的单词，如 radar、toot、madam 等。
双向队列 Deque 从队列两端弹出数据，可用于检测目标单词首尾两端的字符是否一一对应，即是否对称。

from queues import Deque

def pal_checker(a_string):
    char_deque = Deque()

    for ch in a_string:
        char_deque.add_rear(ch)

    still_equal = True

    while char_deque.size() > 1 and still_equal:
        first = char_deque.remove_front()
        last = char_deque.remove_rear()

        if first != last:
            still_equal = False

    return still_equal

print(pal_checker("lsdkjfskf"))  #False
print(pal_checker("radar"))  # True

radar

四、Sort

Bubble Sort

def bubble_sort(a_list):
    for pass_num in range(len(a_list) - 1, 0, -1):
        for i in range(pass_num):
            if a_list[i] > a_list[i + 1]:
                a_list[i], a_list[i + 1] = a_list[i + 1], a_list[i]

a_list = [54, 26, 93, 17, 77, 31, 44, 55, 20]
bubble_sort(a_list)
print(a_list)
# => [17, 20, 26, 31, 44, 54, 55, 77, 93]

Bubble Sort 会依次比较序列中相邻两个数字的大小关系，必要时交换位置，保证较大的数字排在另一个数字右侧。整个序列执行完一轮完整的两两对比之后，可以保证最大的数字位于序列的最右侧。
下一轮则可以将第二大的数字交换到序列中倒数第二的位置，依次类推。

Bubble Sort 通常认为是最低效的排序算法。复杂度为 O(n^2)。

Selection Sort

def selection_sort(a_list):
    for fill_slot in range(len(a_list) - 1, 0, -1):
        pos_of_max = 0
        for location in range(1, fill_slot + 1):
            if a_list[location] > a_list[pos_of_max]:
                pos_of_max = location
        a_list[pos_of_max], a_list[location] = a_list[location], a_list[pos_of_max]

a_list = [54, 26, 93, 17, 77, 31, 44, 55, 20]
selection_sort(a_list)
print(a_list)
# => [17, 20, 26, 31, 44, 54, 55, 77, 93]

Selection Sort 会在第一轮对所有数字的比较中，记录最大数字的位置，令其与序列中最后位置的数字互换。然后在第二轮比较中找出第二大的数字，与序列中倒数第二位置上的数字互换，依次类推。复杂度为 O(n^2)。
相对于 Bubble Sort，Selection Sort 减少了数字位置的交换次数，一轮只需要一次交换。

Insertion Sort

def insertion_sort(a_list):
    for index in range(1, len(a_list)):
        current_value = a_list[index]
        position = index

        while position > 0 and a_list[position - 1] > current_value:
            a_list[position] = a_list[position - 1]
            position = position - 1

        a_list[position] = current_value

a_list = [54, 26, 93, 17, 77, 31, 44, 55, 20]
insertion_sort(a_list)
print(a_list)
# => [17, 20, 26, 31, 44, 54, 55, 77, 93]

Insertion Sort 可以理解为在当前序列头部维护着一个长度不断增长的排序好的子序列，每从序列后半段取出一个数字，就根据其与头部序列中数字的大小关系，插入到头部序列的适当位置。

复杂度依旧为 O(n^2)。但通常情况下，shift 操作的性能一般要优于 exchange 操作。
5th pass

Shell Sort

Shell Sort 会从原来的待排序列表中以一定的间隔选取数字组成一系列小的子列表，再依次通过 Insertion Sort 方法进行排序。复杂度大概介于 O(n) 和 O(n^2) 之间。

def shell_sort(a_list):
    sublist_count = len(a_list) // 2
    while sublist_count > 0:
        for start_position in range(sublist_count):
            gap_insertion_sort(a_list, start_position, sublist_count)

        print("After increments of size", sublist_count, "The list is", a_list)
        sublist_count = sublist_count // 2

def gap_insertion_sort(a_list, start, gap):
    for i in range(start + gap, len(a_list), gap):
        current_value = a_list[i]
        position = i
        while position >= gap and a_list[position - gap] > current_value:
            a_list[position] = a_list[position - gap]
            position = position - gap

        a_list[position] = current_value

a_list = [54, 26, 93, 17, 77, 31, 44, 55, 20]
shell_sort(a_list)
print(a_list)
# => After increments of size 4 The list is [20, 26, 44, 17, 54, 31, 93, 55, 77]
# => After increments of size 2 The list is [20, 17, 44, 26, 54, 31, 77, 55, 93]
# => After increments of size 1 The list is [17, 20, 26, 31, 44, 54, 55, 77, 93]
# => [17, 20, 26, 31, 44, 54, 55, 77, 93]

A Shell Sort with Increments of 3
A Shell Sort after Sorting Each Sublist
A Final Insertion Sort with Increment of 1

Merge Sort

Merge Sort 是将原待排序列表递归地一分为二直到成为最小单位，然后在两两合并的过程中进行大小排序。合并操作完成后得到完整的排序好的列表。

Merge Sort 和后面的 Quick Sort 复杂度都为 O(n log n)。Merge Sort 在融合过程中需要使用额外的存储空间，而 Quick Sort 的复杂度有可能提高到 O(n^2) （当分割点不是接近列表的中间位置）。

def merge_sort(a_list):
    print("Splitting ", a_list)
    if len(a_list) > 1:
        mid = len(a_list) // 2
        left_half = a_list[:mid]
        right_half = a_list[mid:]

        merge_sort(left_half)
        merge_sort(right_half)

        i = 0
        j = 0
        k = 0

        while i < len(left_half) and j < len(right_half):
            if left_half[i] < right_half[j]:
                a_list[k] = left_half[i]
                i = i + 1
            else:
                a_list[k] = right_half[j]
                j = j + 1
            k = k + 1

        while i < len(left_half):
            a_list[k] = left_half[i]
            i = i + 1
            k = k + 1

        while j < len(right_half):
            a_list[k] = right_half[j]
            j = j + 1
            k = k + 1
    print("Merging ", a_list)

a_list = [54, 26, 93, 17, 77, 31, 44, 55, 20]
merge_sort(a_list)
print(a_list)

Splitting the List in a Merge Sort
Lists as They Are Merged Together

Quick Sort

def quick_sort(a_list):
    quick_sort_helper(a_list, 0, len(a_list) - 1)

def quick_sort_helper(a_list, first, last):
    if first < last:
        split_point = partition(a_list, first, last)

        quick_sort_helper(a_list, first, split_point - 1)
        quick_sort_helper(a_list, split_point + 1, last)

def partition(a_list, first, last):
    pivot_value = a_list[first]

    left_mark = first + 1
    right_mark = last

    done = False
    while not done:
        while left_mark <= right_mark and \
                a_list[left_mark] <= pivot_value:
            left_mark = left_mark + 1

        while a_list[right_mark] >= pivot_value and \
                right_mark >= left_mark:
            right_mark = right_mark - 1

        if right_mark < left_mark:
            done = True
        else:
            temp = a_list[left_mark]
            a_list[left_mark] = a_list[right_mark]
            a_list[right_mark] = temp

    temp = a_list[first]
    a_list[first] = a_list[right_mark]
    a_list[right_mark] = temp

    return right_mark

a_list = [54, 26, 93, 17, 77, 31, 44, 55, 20]
quick_sort(a_list)
print(a_list)

Finding the Split Point for 54
Completing the Partition Process to Find the Split Point for 54

参考资料

Problem Solving with Algorithms and Data Structures using Python