1
Fork 0
similarity-sandbox/gzip_similarity.py

28 lines
750 B
Python

import json
import gzip
def _rank_similar_pages(title, encoded_pages):
    """Rank every other page by how cheaply it compresses after *title*.

    For each other page, the score is the number of extra gzip bytes needed
    to compress ``page + other`` compared with compressing ``page`` alone.
    A smaller score means the other page shares more material with *title*.

    Args:
        title: Key in *encoded_pages* of the page being checked.
        encoded_pages: Mapping of page title to its content as bytes.

    Returns:
        A ``(compressed_size, ranking)`` tuple: the gzip size of the page on
        its own, and a list of ``(other_title, extra_bytes)`` pairs sorted by
        ascending ``extra_bytes`` (ties keep the mapping's iteration order).
    """
    content = encoded_pages[title]
    compressed_size = len(gzip.compress(content))
    extra_sizes = {
        other_title: len(gzip.compress(content + other_content)) - compressed_size
        for other_title, other_content in encoded_pages.items()
        if other_title != title  # a page is never compared with itself
    }
    ranking = sorted(extra_sizes.items(), key=lambda item: item[1])
    return compressed_size, ranking


def main():
    """Print, for each page in ``pages.json``, its three most similar pages.

    ``pages.json`` is read from the current working directory and must hold
    a JSON object mapping page titles to page text. For every page this
    prints its title, its gzip-compressed size, and the three best
    ``(other_title, extra_bytes)`` matches.

    Raises:
        OSError: If ``pages.json`` cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open("pages.json", encoding="utf-8") as f:
        pages = json.load(f)

    # Encode each page once instead of once per compared pair. Concatenating
    # the encoded bytes is equivalent to encoding the concatenated text.
    encoded_pages = {
        title: content.encode("utf-8") for title, content in pages.items()
    }

    # NOTE(review): DEFLATE can only reference roughly the last 32 KiB of
    # input, so scores for pages larger than that may be less meaningful -
    # confirm the expected page sizes.
    for title in encoded_pages:
        print(f"Checking '{title}'")
        compressed_size, similar_pages = _rank_similar_pages(title, encoded_pages)
        print("\tCompressed size:", compressed_size)
        print("\t", similar_pages[:3])
# Run the comparison only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()