From 5273c5e5a29e7dfa207fd02d271eda9b64df4d3b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 15 Nov 2023 18:03:45 +0800 Subject: [PATCH] test --- .github/workflows/run.yml | 24 +++++------ resouces/direct-need-to-remove.txt | 35 ++++++++++++++++ resouces/direct.txt | 0 resouces/findRedundantDomain.py | 67 ++++++++++++++++++++++++++++++ resouces/proxy-need-to-remove.txt | 3 ++ resouces/proxy.txt | 2 + resouces/reject-need-to-remove.txt | 33 +++++++++++++++ resouces/reject.txt | 0 resouces/removeFrom.py | 29 +++++++++++++ 9 files changed, 180 insertions(+), 13 deletions(-) create mode 100644 resouces/direct-need-to-remove.txt create mode 100644 resouces/direct.txt create mode 100644 resouces/findRedundantDomain.py create mode 100644 resouces/proxy-need-to-remove.txt create mode 100644 resouces/proxy.txt create mode 100644 resouces/reject-need-to-remove.txt create mode 100644 resouces/reject.txt create mode 100644 resouces/removeFrom.py diff --git a/.github/workflows/run.yml b/.github/workflows/run.yml index 8bbe2e2c..b3b8c21e 100644 --- a/.github/workflows/run.yml +++ b/.github/workflows/run.yml @@ -6,7 +6,6 @@ on: push: branches: - master - - hidden paths-ignore: - "**/README.md" jobs: @@ -27,11 +26,10 @@ jobs: echo "WIN_EXTRA=https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hosts/extra.txt" >> $GITHUB_ENV shell: bash - - name: Checkout the "hidden" branch + - name: Checkout uses: actions/checkout@v3 with: repository: MetaCubeX/meta-rules-dat - ref: hidden - name: Checkout Loyalsoldier/domain-list-custom uses: actions/checkout@v3 @@ -84,11 +82,11 @@ jobs: run: | # curl -sSL ${CUSTOM_DIRECT} | grep -v google | grep -v manhua | grep -v ooklaserver | grep -v "acg.rip" | perl -ne '/^((full|regexp|keyword):[^:]+)(\n$|:@.+)/ && print "$1\n"' | sort --ignore-case -u > direct-reserve.txt curl -sSL ${CUSTOM_PROXY} | grep -Ev ":@cn" | perl -ne 
'/^((full|regexp|keyword):[^:]+)(\n$|:@.+)/ && print "$1\n"' | sort --ignore-case -u > proxy-reserve.txt - + - name: Add proxy, direct and reject domains from "hidden" branch to appropriate temp files run: | - cat proxy.txt >> temp-proxy.txt - cat direct.txt >> temp-direct.txt + cat ./resouces/proxy.txt >> temp-proxy.txt + cat ./resouces/direct.txt >> temp-direct.txt # cat reject.txt >> temp-reject.txt - name: Sort and generate redundant lists @@ -99,20 +97,20 @@ jobs: - name: Remove redundant domains run: | - chmod +x *.py - python ./findRedundantDomain.py ./direct-list-with-redundant ./direct-list-deleted-unsort - python ./findRedundantDomain.py ./proxy-list-with-redundant ./proxy-list-deleted-unsort + chmod +x ./resouces/*.py + python ./resouces/findRedundantDomain.py ./direct-list-with-redundant ./direct-list-deleted-unsort + python ./resouces/findRedundantDomain.py ./proxy-list-with-redundant ./proxy-list-deleted-unsort [ ! -f "direct-list-deleted-unsort" ] && touch direct-list-deleted-unsort [ ! 
-f "proxy-list-deleted-unsort" ] && touch proxy-list-deleted-unsort sort ./direct-list-deleted-unsort > ./direct-list-deleted-sort sort ./proxy-list-deleted-unsort > ./proxy-list-deleted-sort - python ./removeFrom.py -remove ./direct-list-deleted-sort -from ./direct-list-with-redundant -out direct-list-without-redundant - python ./removeFrom.py -remove ./proxy-list-deleted-sort -from ./proxy-list-with-redundant -out proxy-list-without-redundant + python ./resouces/removeFrom.py -remove ./direct-list-deleted-sort -from ./direct-list-with-redundant -out direct-list-without-redundant + python ./resouces/removeFrom.py -remove ./proxy-list-deleted-sort -from ./proxy-list-with-redundant -out proxy-list-without-redundant - name: Remove domains from "need-to-remove" lists in "hidden" branch run: | - python ./removeFrom.py -remove ./direct-need-to-remove.txt -from ./direct-list-without-redundant -out temp-cn.txt - python ./removeFrom.py -remove ./proxy-need-to-remove.txt -from ./proxy-list-without-redundant -out temp-geolocation-\!cn.txt + python ./resouces/removeFrom.py -remove ./resouces/direct-need-to-remove.txt -from ./direct-list-without-redundant -out ./temp-cn.txt + python ./resouces/removeFrom.py -remove ./resouces/proxy-need-to-remove.txt -from ./proxy-list-without-redundant -out ./temp-geolocation-\!cn.txt - name: Remove domains end with ".cn" in "temp-geolocation-!cn.txt" and write lists to data directory run: | diff --git a/resouces/direct-need-to-remove.txt b/resouces/direct-need-to-remove.txt new file mode 100644 index 00000000..25074f69 --- /dev/null +++ b/resouces/direct-need-to-remove.txt @@ -0,0 +1,35 @@ +103.com +123cha.com +95081.com +airasia.com +baid.us +baidu.jp +bussou.com +busytrade.com +cnbeta.com +cnbetacdn.com +cnpolitics.org +dm530.net +duanzhihu.com +dysfz.cc +emacs-china.org +galaxymacau.com +galstars.net +haitum.com +hostloc.com +jiaoyou8.com +kh.google.com +laonanren.com +mysinablog.com +ntrqq.com +nytlog.com +shuangtv.net +suppig.net +top 
#!/usr/bin/env python3
"""Find redundant items in domain lists.

e.g. 'bar.foo.com' is redundant for 'foo.com'.

Usage: findRedundantDomain.py <domain-list> <removed-domain-file>
"""

import sys

# Unique marker stored in the tree at the position where a listed domain ends.
_LEAF = object()


def load(list):
    """Parse a domain list file into label lists.

    Blank lines and lines starting with '#' are skipped. Each remaining line
    is lower-cased (domain names are case-insensitive) and split on '.'.

    Args:
        list: Path of the domain list file. The name shadows the builtin but
            is kept so existing keyword callers keep working.

    Returns:
        A list of label lists sorted by number of labels (stable), e.g.
        [['abc', 'com'], ['bar', 'foo', 'com'], ...].
    """
    path = list
    results = []
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            results.append(line.lower().split('.'))

    # Shorter domains first, so a parent is always registered in the tree
    # before any of its subdomains is examined by find().
    results.sort(key=len)
    return results


def find(labelses, removedDomainFile):
    """Report domains that are covered by an already-seen domain.

    Builds a tree from the top-level label downwards, e.g.
    { 'com': { 'foo': LEAF, 'abc': LEAF }, 'org': ... }. While walking a
    domain from its last label to its first, hitting a LEAF means the domain
    equals, or is a subdomain of, a domain seen earlier.

    Every redundant domain is printed and appended (one per line) to
    `removedDomainFile`. The file is not touched when nothing is redundant.
    The label lists in `labelses` are left unmodified.

    Args:
        labelses: Iterable of label lists, expected shortest-first as
            returned by load().
        removedDomainFile: Path of the file the redundant domains are
            appended to.
    """
    tree = {}
    redundant = []
    for labels in labelses:
        domain = '.'.join(labels)
        last = len(labels) - 1
        # Init root node as current node
        node = tree
        # Walk from the top-level label towards the leftmost label without
        # consuming the caller's list.
        for depth, label in enumerate(reversed(labels)):
            child = node.get(label)
            if child is _LEAF:
                # Labels not yet visited, i.e. the part left of the match.
                remaining = labels[:last - depth]
                print(f"Redundant found: {domain} at {'.'.join(remaining)}")
                redundant.append(domain)
                break
            if child is None:
                # Leaf for the final label, branch node otherwise.
                child = _LEAF if depth == last else {}
                node[label] = child
            node = child

    # Single append instead of reopening the file for every hit.
    if redundant:
        with open(removedDomainFile, "a") as f:
            f.writelines(f"{name}\n" for name in redundant)


def main(argv=None):
    """Command-line entry point: echo both paths, then run the search."""
    args = sys.argv[1:] if argv is None else argv
    if len(args) != 2:
        sys.exit("usage: findRedundantDomain.py <domain-list> <removed-domain-file>")
    print(args[0], args[1])
    find(load(args[0]), args[1])


if __name__ == '__main__':
    main()
import argparse


def remove_domains(file_to_remove, file_to_remove_from, output_file):
    """Write the entries of one list that do not appear in another.

    Lines are compared after stripping surrounding whitespace. Blank lines
    are ignored and duplicate entries are written once. Output keeps the
    order in which entries first appear in `file_to_remove_from`, so repeated
    runs produce identical files, and every written line ends with a newline.

    Args:
        file_to_remove: Path of the file listing entries to drop.
        file_to_remove_from: Path of the file to filter.
        output_file: Path of the file to (over)write with the survivors.
    """
    with open(file_to_remove, "r") as f_remove:
        domains_to_remove = {line.strip() for line in f_remove}

    with open(file_to_remove_from, "r") as f_from:
        # dict.fromkeys de-duplicates while keeping first-seen order.
        candidates = dict.fromkeys(line.strip() for line in f_from)

    remaining_domains = [
        domain
        for domain in candidates
        if domain and domain not in domains_to_remove
    ]

    with open(output_file, "w") as output:
        output.writelines(f"{domain}\n" for domain in remaining_domains)


def main():
    """Parse the -remove / -from / -out options and run the filter."""
    parser = argparse.ArgumentParser(description="Remove domains from a file.")
    parser.add_argument(
        "-remove", required=True, help="File containing domains to be removed"
    )
    parser.add_argument(
        "-from", required=True, dest="from_file", help="File to remove domains from"
    )
    parser.add_argument("-out", required=True, help="Output file")

    args = parser.parse_args()

    remove_domains(args.remove, args.from_file, args.out)


if __name__ == "__main__":
    main()