Login
main >   python >  


Color

TQDM progress bars
RICH menus etc
PANDOC latex to PDF

Multi-I/O handling with threads (concurrent.futures)

https://docs.python.org/3/library/concurrent.futures.html

import time
import requests
import concurrent.futures


def get_wiki_page_existence(wiki_page_url, timeout=10):
    """Fetch *wiki_page_url* and report whether the page exists.

    Returns a string of the form "<url> - <status>", where status is
    "exists" for HTTP 200, "does not exist" for HTTP 404, and
    "unknown" for anything else.
    """
    status_by_code = {200: "exists", 404: "does not exist"}

    response = requests.get(url=wiki_page_url, timeout=timeout)
    page_status = status_by_code.get(response.status_code, "unknown")

    # Artificial delay so the threaded speed-up is easy to observe.
    time.sleep(3)

    return wiki_page_url + " - " + page_status
wiki_page_urls = ["https://en.wikipedia.org/wiki/" + str(i) for i in range(50)]

print("Running threaded:")
threaded_start = time.time()
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Fan out one task per URL, then print each result as soon as it
    # completes (completion order, not submission order).
    futures = [
        executor.submit(get_wiki_page_existence, wiki_page_url=url)
        for url in wiki_page_urls
    ]
    for count, future in enumerate(concurrent.futures.as_completed(futures), start=1):
        print(future.result())
        print(count)
print("Threaded time:", time.time() - threaded_start)

Multi-core processing with the multiprocessing module

import time
import multiprocessing as mp


def multiprocessing_func(x):
    """Simulate a slow task: sleep three seconds, then report *x* as done."""
    delay_seconds = 3
    time.sleep(delay_seconds)
    message = '{} done'.format(x)
    print(message)
    
if __name__ == '__main__':
    # Run one sleep-task per CPU core, twice, and time the whole thing.
    # Each batch should take ~3s wall-clock despite 3s * cpu_count of
    # total sleeping, because the pool runs the tasks in parallel.
    cpu_c = mp.cpu_count()
    starttime = time.time()
    pool = mp.Pool()
    # list(range(...)) instead of a redundant [i for i in range(...)] copy.
    list_of_lists = list(range(cpu_c))
    pool.map(multiprocessing_func, list_of_lists)
    print("next")
    pool.map(multiprocessing_func, list_of_lists)
    pool.close()
    # join() waits for the workers to shut down cleanly before timing ends.
    pool.join()
    print('That took {} seconds'.format(time.time() - starttime))

Comparison: serial loop vs. ThreadPoolExecutor vs. multiprocessing Pool for file hashing

import concurrent.futures
import datetime
import hashlib
import multiprocessing as mp
import os
import random
import shutil
import sys
import time


PATH = r'F:\Audio\processed'


def create_checksum(file_path):
    """Return the hex MD5 digest of the file at *file_path*.

    The file is read in 4 KiB chunks so arbitrarily large files can be
    hashed without loading them fully into memory.
    """
    # Renamed from `hash`, which shadowed the builtin of the same name.
    digest = hashlib.md5()
    with open(file_path, "rb") as f:
        # iter() with a sentinel keeps calling f.read(4096) until it
        # returns b"" at end-of-file.
        for chunk in iter(lambda: f.read(4096), b""):
            digest.update(chunk)
    return digest.hexdigest()
   

def hash_file(file_path):
    """Return a ``(file_path, md5_hexdigest)`` pair for *file_path*."""
    checksum = create_checksum(file_path)
    return file_path, checksum


def create_chunks(list_name, n):
    """Yield successive slices of *list_name*, each at most *n* items long."""
    start = 0
    while start < len(list_name):
        yield list_name[start:start + n]
        start += n


if __name__ == "__main__":
    # Benchmark three strategies for MD5-hashing a directory of files:
    # a plain serial loop, a thread pool, and a process pool.
    begin_time = datetime.datetime.now()
    print(begin_time)

    # Collect up to ~1000 file paths under PATH. The for/else idiom
    # breaks out of both loops once the cap is exceeded.
    all_paths = {}
    count = 0
    for root, dirs, files in os.walk(PATH):
        for file in files:
            all_paths[os.path.join(root, file)] = True
            if len(all_paths) > 1000:
                break
        else:
            continue
        break

    print(len(all_paths))
    done_time = datetime.datetime.now()
    print(done_time)

    # --- Serial baseline ---
    _start = time.time()
    all_hashed = {}
    for file_p in all_paths:
        filepath, hash_of_file = hash_file(file_p)
        all_hashed[filepath] = hash_of_file
        count += 1
    print("Loop took: {}".format(time.time() - _start))

    confirmation = ''
    while confirmation not in ['Y', 'N']:
        confirmation = input('Continue? (Y/N): ').upper()
    if confirmation == 'N':
        # Requires `import sys`, which the original file was missing.
        sys.exit()

    # --- Thread pool ---
    _start = time.time()
    all_hashed = {}
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(hash_file, file_path=filepath)
                   for filepath in all_paths]
        for future in concurrent.futures.as_completed(futures):
            filepath, hash_of_file = future.result()
            all_hashed[filepath] = hash_of_file
    print("ThreadPoolExecutor took: {}".format(time.time() - _start))

    confirmation = ''
    while confirmation not in ['Y', 'N']:
        confirmation = input('Continue? (Y/N): ').upper()
    if confirmation == 'N':
        sys.exit()

    # --- Process pool ---
    _start = time.time()
    cpu_c = mp.cpu_count()
    pool = mp.Pool()
    all_hashed = {}
    for list_of_files in create_chunks(list(all_paths), cpu_c):
        # BUG FIX: the original called pool.map twice per chunk,
        # hashing every file twice and discarding the first result.
        hashes = pool.map(hash_file, list_of_files)
        for file_path, hash_of_file in hashes:
            all_hashed[file_path] = hash_of_file

    pool.close()
    # Wait for worker shutdown so the timing reflects the full run.
    pool.join()
    print("Multiprocessing took: {}".format(time.time() - _start))

hidden1

hidden2