import snappy
import json
import os
from multiprocessing import Pool

# Not referenced by the code visible in this file; kept in case other code
# relies on the name.
d = None

# Raw data file that is scanned for snappy-compressed JSON documents.
# Alternative input used previously: 'recover_couch_part.dat'
file_path = 'recover_couch_slice.dat'
file_size = os.path.getsize(file_path)
process_count = 20

# Integer division keeps the arithmetic exact for very large files.  The lower
# bound of 1 prevents a ZeroDivisionError on the next line when the file holds
# fewer bytes than there are processes (including an empty file).
chunk_size_per_process = max(1, file_size // process_count)
# One more than the number of chunk-sized ranges that fit into the file; the
# __main__ block iterates over range(0, chunk_count - 1).
chunk_count = file_size // chunk_size_per_process + 1

# Not referenced by the code visible in this file; kept for compatibility.
possible_match = []

# Base unit, in bytes, for the read windows: extract_document_around_offset
# reads 4x this many bytes starting 2x before a match, and process_chunk
# overlaps consecutive read buffers by this amount.
window_half_size = 512


def search_buf_header(buf, offset):
    """Return the absolute offsets of every b'\\x01\\x00\\x00\\x00' in ``buf``.

    Args:
        buf: Bytes-like data (or any sequence of byte values) to scan.
        offset: Absolute position of ``buf[0]``; added to every match index.

    Returns:
        List of ``offset + index`` values in ascending order, one per
        occurrence of the 4-byte pattern.
    """
    pattern = b'\x01\x00\x00\x00'
    data = buf if isinstance(buf, (bytes, bytearray)) else bytes(buf)
    matches = []
    # bytes.find scans natively and, unlike the former range(len(buf) - 4)
    # loop, also reports a pattern that ends on the very last byte.
    index = data.find(pattern)
    while index != -1:
        matches.append(offset + index)
        index = data.find(pattern, index + 1)
    return matches



def search_buf_reg(buf, offset):
    """Return the absolute offsets of every b'REG' occurrence in ``buf``.

    Args:
        buf: Bytes-like data (or any sequence of byte values) to scan.
        offset: Absolute position of ``buf[0]``; added to every match index.

    Returns:
        List of ``offset + index`` values in ascending order.
    """
    pattern = b'REG'
    data = buf if isinstance(buf, (bytes, bytearray)) else bytes(buf)
    matches = []
    # The previous range(len(buf) - 3) loop stopped one position early and
    # missed a 'REG' ending on the last byte; bytes.find has no such gap.
    index = data.find(pattern)
    while index != -1:
        matches.append(offset + index)
        index = data.find(pattern, index + 1)
    return matches


def search_buf_dir(buf, offset):
    """Return the absolute offsets of every b'DIR' occurrence in ``buf``.

    Args:
        buf: Bytes-like data (or any sequence of byte values) to scan.
        offset: Absolute position of ``buf[0]``; added to every match index.

    Returns:
        List of ``offset + index`` values in ascending order.
    """
    pattern = b'DIR'
    data = buf if isinstance(buf, (bytes, bytearray)) else bytes(buf)
    matches = []
    # Searching with bytes.find also covers the final position, which the
    # earlier range(len(buf) - 3) index loop skipped.
    index = data.find(pattern)
    while index != -1:
        matches.append(offset + index)
        index = data.find(pattern, index + 1)
    return matches


def search_buf_key(buf, offset):
    """Return absolute offsets of b'_ke' and b'"_k' occurrences in ``buf``.

    Only these 3-byte prefixes are compared, not a full '_key' token.

    Args:
        buf: Bytes-like data (or any sequence of byte values) to scan.
        offset: Absolute position of ``buf[0]``; added to every match index.

    Returns:
        List of ``offset + index`` values: all b'_ke' matches in ascending
        order first, followed by all b'"_k' matches in ascending order.  The
        combined list is therefore not globally sorted.
    """
    data = buf if isinstance(buf, (bytes, bytearray)) else bytes(buf)
    matches = []
    # One pass per pattern keeps the original result order.  bytes.find also
    # reaches the last two positions that range(len(buf) - 4) left out.
    for pattern in (b'_ke', b'"_k'):
        index = data.find(pattern)
        while index != -1:
            matches.append(offset + index)
            index = data.find(pattern, index + 1)
    return matches


def search_buf_seq(buf, offset):
    """Return the absolute offsets of every b'_se' occurrence in ``buf``.

    Only the 3-byte prefix is compared, not a full '_seq' token.

    Args:
        buf: Bytes-like data (or any sequence of byte values) to scan.
        offset: Absolute position of ``buf[0]``; added to every match index.

    Returns:
        List of ``offset + index`` values in ascending order.
    """
    pattern = b'_se'
    data = buf if isinstance(buf, (bytes, bytearray)) else bytes(buf)
    matches = []
    # range(len(buf) - 4) used to skip the last two candidate positions for
    # this 3-byte pattern; bytes.find examines all of them.
    index = data.find(pattern)
    while index != -1:
        matches.append(offset + index)
        index = data.find(pattern, index + 1)
    return matches


def search_buf_file_meta(buf, offset):
    """Return the absolute offsets of every b'"fi' occurrence in ``buf``.

    Only this 3-byte prefix (a double quote followed by 'fi') is compared.

    Args:
        buf: Bytes-like data (or any sequence of byte values) to scan.
        offset: Absolute position of ``buf[0]``; added to every match index.

    Returns:
        List of ``offset + index`` values in ascending order.
    """
    pattern = b'"fi'
    data = buf if isinstance(buf, (bytes, bytearray)) else bytes(buf)
    matches = []
    # bytes.find also reports a pattern ending on the last byte, which the
    # former range(len(buf) - 3) loop never inspected.
    index = data.find(pattern)
    while index != -1:
        matches.append(offset + index)
        index = data.find(pattern, index + 1)
    return matches


def extract_document_around_offset(match_offset, raw_data):
    """Try to recover snappy-compressed documents near ``match_offset``.

    A window of ``window_half_size * 4`` bytes is read, starting
    ``window_half_size * 2`` bytes before ``match_offset`` (clamped to the
    start of the file).  Every index in the first half of the window is tried
    as a document start, and every position right after a '}' byte in the
    second half is tried as a document end.  For each start, the first end
    whose slice decompresses to data beginning with '{' is accepted.

    Args:
        match_offset: Absolute file offset around which to search.
        raw_data: Seekable binary file object.  Its read position is changed
            by this call and is not restored.

    Returns:
        List of decompressed payloads, as returned by ``snappy.decompress``,
        whose first byte is '{'.  Empty when nothing could be decompressed.
    """
    raw_data.seek(max(0, match_offset - window_half_size * 2))
    match_buf = raw_data.read(window_half_size * 4)
    midpoint = len(match_buf) // 2

    closing_brace = ord('}')
    opening_brace = ord('{')

    # Candidate end positions are exclusive slice bounds, hence the +1.
    possible_document_end_offsets = [
        position + 1
        for position in range(midpoint, len(match_buf))
        if match_buf[position] == closing_brace
    ]
    if not possible_document_end_offsets:
        return []

    documents_found = []
    for start in range(midpoint):
        for end_offset in possible_document_end_offsets:
            try:
                doc = snappy.decompress(match_buf[start:end_offset])
            except Exception:
                # Most slices are not valid snappy data; that is expected.
                # Catching Exception rather than using a bare except lets
                # KeyboardInterrupt and SystemExit stop this brute-force loop.
                continue
            if doc and doc[0] == opening_brace:
                documents_found.append(doc)
                # Only the first matching end is kept for a given start.
                break
    return documents_found

def reg_to_csv(doc, csv_dump):
    """Append one tab-separated row for a recovered 'REG' document.

    The row columns are, in order: scope, key, type, parent_uuid, name,
    size (always written as 0), mode, timestamp, deleted, version, sequence.
    They are read from the JSON keys ``_scope``, ``_key``, ``type``,
    ``parent_uuid``, ``name``, ``mode``, ``_timestamp``, ``_deleted``,
    ``_version`` and ``_seq``; missing keys become empty strings.

    Args:
        doc: JSON text (``str`` or ``bytes``) of the document.
        csv_dump: Writable text file object that receives the row.

    Returns:
        None.  Documents that cannot be parsed are reported on stdout and
        skipped; parsed documents that are not JSON objects or that lack a
        ``_scope`` key are skipped silently.
    """
    try:
        d = json.loads(doc)
    except (UnicodeDecodeError, json.decoder.JSONDecodeError):
        print(f"Skipped invalid JSON document: {doc}")
        return

    if not isinstance(d, dict) or '_scope' not in d:
        return

    scope = d.get('_scope')
    key = d.get('_key', '')
    parent_uuid = d.get('parent_uuid', '')
    # A raw tab would break the column layout, so it is replaced by a marker.
    # str() guards against non-string names (e.g. JSON null), which used to
    # raise TypeError on the tab check and abort the whole chunk.
    name = str(d.get('name', '')).replace("\t", "___TABULATOR___")
    file_type = d.get('type', '')
    mode = d.get('mode', '')
    timestamp = d.get('_timestamp', '')
    deleted = d.get('_deleted', '')
    version = d.get('_version', '')
    seq = d.get('_seq', '')

    csv_dump.write(f'{scope}\t{key}\t{file_type}\t{parent_uuid}\t{name}\t0\t{mode}\t{timestamp}\t{deleted}\t{version}\t{seq}\n')

def dir_to_csv(doc, csv_dump):
    """Append one tab-separated row for a recovered 'DIR' document.

    The row columns are, in order: scope, key, type, parent_uuid, name,
    size (always written as 0), mode, timestamp, deleted, version, sequence.
    They are read from the JSON keys ``_scope``, ``_key``, ``type``,
    ``parent_uuid``, ``name``, ``mode``, ``_timestamp``, ``_deleted``,
    ``_version`` and ``_seq``; missing keys become empty strings.

    Args:
        doc: JSON text (``str`` or ``bytes``) of the document.
        csv_dump: Writable text file object that receives the row.

    Returns:
        None.  Documents that cannot be parsed are reported on stdout and
        skipped; parsed documents that are not JSON objects or that lack a
        ``_scope`` key are skipped silently.
    """
    try:
        d = json.loads(doc)
    except (UnicodeDecodeError, json.decoder.JSONDecodeError):
        print(f"Skipped invalid JSON document: {doc}")
        return

    if not isinstance(d, dict) or '_scope' not in d:
        return

    scope = d.get('_scope')
    key = d.get('_key', '')
    parent_uuid = d.get('parent_uuid', '')
    # Tabs inside the name would shift the columns, so they are replaced.
    # str() keeps a non-string name (e.g. JSON null) from raising TypeError
    # on the tab check, which previously aborted the whole chunk.
    name = str(d.get('name', '')).replace("\t", "___TABULATOR___")
    file_type = d.get('type', '')
    mode = d.get('mode', '')
    timestamp = d.get('_timestamp', '')
    deleted = d.get('_deleted', '')
    version = d.get('_version', '')
    seq = d.get('_seq', '')

    csv_dump.write(f'{scope}\t{key}\t{file_type}\t{parent_uuid}\t{name}\t0\t{mode}\t{timestamp}\t{deleted}\t{version}\t{seq}\n')


def process_chunk(file_range):
    """Scan one byte range of ``file_path`` and dump recovered records.

    The range is read in 5 MiB buffers; consecutive buffers overlap by
    ``window_half_size`` bytes.  Each buffer is searched for 'REG', 'DIR' and
    '"fi' markers, documents are extracted around every marker, and those
    containing b'REG' or b'DIR' are written as tab-separated rows to
    ``recovery-<start offset>.csv``.  The same document can be reached from
    several markers; no de-duplication is performed.

    Args:
        file_range: ``(start_offset, end_offset)`` tuple of absolute file
            offsets.  Scanning continues while the buffer start is below
            ``end_offset``, so the last buffer may read past it.

    Returns:
        Number of extracted documents that contained b'REG' or b'DIR'.
    """
    offset, range_end = file_range
    buf_size = 1024 * 1024 * 5
    report_counter = 0
    found_counter = 0
    # Order matters for the order of output rows within one buffer.
    searchers = (search_buf_reg, search_buf_dir, search_buf_file_meta)

    # The context managers close both files even if a record raises midway.
    # An explicit encoding keeps the output independent of the locale.
    with open(file_path, 'rb') as raw_data, \
            open(f'recovery-{offset}.csv', 'w', encoding='utf-8') as csv_dump:
        # Only the range starting at the beginning of the file gets a header.
        if offset == 0:
            csv_dump.write('#scope\tfile_id\ttype\tparent_uuid\tname\tsize\tmode\ttimestamp\tdeleted\tversion\tsequence\n')

        raw_data.seek(offset)
        while offset < range_end:
            buf = raw_data.read(buf_size)

            possible_docs = []
            for search in searchers:
                for match_offset in search(buf, offset):
                    possible_docs.extend(
                        extract_document_around_offset(match_offset, raw_data))

            reg_docs = []
            dir_docs = []
            for doc in possible_docs:
                # A document containing both markers is treated as REG.
                if b'REG' in doc:
                    reg_docs.append(doc)
                    found_counter += 1
                elif b'DIR' in doc:
                    dir_docs.append(doc)
                    found_counter += 1

            # All REG rows of a buffer are written before its DIR rows.
            for reg_doc in reg_docs:
                reg_to_csv(reg_doc, csv_dump)
            for dir_doc in dir_docs:
                dir_to_csv(dir_doc, csv_dump)

            csv_dump.flush()

            # Step back by window_half_size so the next buffer overlaps this
            # one.  The seek is required because document extraction moved
            # the read position.
            offset += buf_size - window_half_size
            raw_data.seek(offset)

            report_counter += 1
            if report_counter == 10:
                print(f"Found {found_counter} records in total in range {file_range} [{(offset-file_range[0])/(file_range[1]-file_range[0])*100}%]")
                report_counter = 0

    return found_counter

def build_chunk_ranges(total_size, chunk_size, range_count, overlap=2048):
    """Split ``total_size`` bytes into ``(start, end)`` ranges for the workers.

    Range ``k`` nominally covers ``[k * chunk_size, (k + 1) * chunk_size)``.
    Its start is moved back by a fixed ``overlap`` (never below 0) and its
    end is capped at ``total_size``.

    Args:
        total_size: Size of the file in bytes.
        chunk_size: Nominal number of bytes per range.
        range_count: Number of ranges to generate.
        overlap: Bytes by which each range reaches back into the previous one.

    Returns:
        List of ``(start, end)`` tuples sorted by start, with unique starts.
    """
    merged = {}
    for index in range(range_count):
        # A fixed overlap: the earlier start * (chunk_size - 2048) made the
        # overlap grow with the index and went negative for small chunks.
        begin = max(0, index * chunk_size - overlap)
        end = min(total_size, (index + 1) * chunk_size)
        # Ranges clamped to the same start are merged so that no two workers
        # write to the same recovery-<start>.csv file.
        merged[begin] = max(end, merged.get(begin, 0))
    return sorted(merged.items())


if __name__ == '__main__':
    chunks = build_chunk_ranges(file_size, chunk_size_per_process, chunk_count - 1)
    print(f'Processing file in parallel in chunks: {chunks}')
    with Pool(process_count) as p:
        # One worker call per range; each returns its record count.
        found = p.map(process_chunk, chunks)
    print(f'Completed - found {found} in searched ranges')
    print(f'Total {sum(found)} records were found')

