This commit is contained in:
github-actions[bot] 2023-11-15 18:03:45 +08:00
parent 8026386706
commit 5273c5e5a2
9 changed files with 180 additions and 13 deletions

View File

@ -6,7 +6,6 @@ on:
push:
branches:
- master
- hidden
paths-ignore:
- "**/README.md"
jobs:
@ -27,11 +26,10 @@ jobs:
echo "WIN_EXTRA=https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hosts/extra.txt" >> $GITHUB_ENV
shell: bash
- name: Checkout the "hidden" branch
- name: Checkout
uses: actions/checkout@v3
with:
repository: MetaCubeX/meta-rules-dat
ref: hidden
- name: Checkout Loyalsoldier/domain-list-custom
uses: actions/checkout@v3
@ -87,8 +85,8 @@ jobs:
- name: Add proxy, direct and reject domains from "hidden" branch to appropriate temp files
run: |
cat proxy.txt >> temp-proxy.txt
cat direct.txt >> temp-direct.txt
cat ./resouces/proxy.txt >> temp-proxy.txt
cat ./resouces/direct.txt >> temp-direct.txt
# cat reject.txt >> temp-reject.txt
- name: Sort and generate redundant lists
@ -99,20 +97,20 @@ jobs:
- name: Remove redundant domains
run: |
chmod +x *.py
python ./findRedundantDomain.py ./direct-list-with-redundant ./direct-list-deleted-unsort
python ./findRedundantDomain.py ./proxy-list-with-redundant ./proxy-list-deleted-unsort
chmod +x ./resouces/*.py
python ./resouces/findRedundantDomain.py ./direct-list-with-redundant ./direct-list-deleted-unsort
python ./resouces/findRedundantDomain.py ./proxy-list-with-redundant ./proxy-list-deleted-unsort
[ ! -f "direct-list-deleted-unsort" ] && touch direct-list-deleted-unsort
[ ! -f "proxy-list-deleted-unsort" ] && touch proxy-list-deleted-unsort
sort ./direct-list-deleted-unsort > ./direct-list-deleted-sort
sort ./proxy-list-deleted-unsort > ./proxy-list-deleted-sort
python ./removeFrom.py -remove ./direct-list-deleted-sort -from ./direct-list-with-redundant -out direct-list-without-redundant
python ./removeFrom.py -remove ./proxy-list-deleted-sort -from ./proxy-list-with-redundant -out proxy-list-without-redundant
python ./resouces/removeFrom.py -remove ./direct-list-deleted-sort -from ./direct-list-with-redundant -out direct-list-without-redundant
python ./resouces/removeFrom.py -remove ./proxy-list-deleted-sort -from ./proxy-list-with-redundant -out proxy-list-without-redundant
- name: Remove domains from "need-to-remove" lists in "hidden" branch
run: |
python ./removeFrom.py -remove ./direct-need-to-remove.txt -from ./direct-list-without-redundant -out temp-cn.txt
python ./removeFrom.py -remove ./proxy-need-to-remove.txt -from ./proxy-list-without-redundant -out temp-geolocation-\!cn.txt
python ./resouces/removeFrom.py -remove ./resouces/direct-need-to-remove.txt -from ./direct-list-without-redundant -out ./temp-cn.txt
python ./resouces/removeFrom.py -remove ./resouces/proxy-need-to-remove.txt -from ./proxy-list-without-redundant -out ./temp-geolocation-\!cn.txt
- name: Remove domains end with ".cn" in "temp-geolocation-!cn.txt" and write lists to data directory
run: |

View File

@ -0,0 +1,35 @@
103.com
123cha.com
95081.com
airasia.com
baid.us
baidu.jp
bussou.com
busytrade.com
cnbeta.com
cnbetacdn.com
cnpolitics.org
dm530.net
duanzhihu.com
dysfz.cc
emacs-china.org
galaxymacau.com
galstars.net
haitum.com
hostloc.com
jiaoyou8.com
kh.google.com
laonanren.com
mysinablog.com
ntrqq.com
nytlog.com
shuangtv.net
suppig.net
top
xclient.info
xjp.cc
yanghengjun.com
ydy.com
yslang.com
yysub.net
hamreus.com

0
resouces/direct.txt Normal file
View File

View File

@ -0,0 +1,67 @@
#!/usr/bin/env python3
''' Find redundant items in domain lists.
    e.g. 'bar.foo.com' is redundant for 'foo.com'.
'''
import sys

# Echo the input list path and the output path for the build log.
# Slicing (rather than indexing) keeps the output identical when both
# arguments are supplied, but no longer raises IndexError when the module
# is imported or started with fewer than two arguments.
print(*sys.argv[1:3])
def load(list):
    ''' Read a domain list file and split every entry into its labels.

        Blank lines and lines starting with '#' are ignored. Every other
        line is stripped, lower-cased (domain names are case-insensitive)
        and split on '.'.

        Returns the label lists ordered by their number of labels, e.g.
            [ ['abc', 'com'],
              ['bar', 'foo', 'com'],
              ... ]
        Entries with the same number of labels keep their file order.
    '''
    with open(list, 'r') as handle:
        entries = [raw.strip() for raw in handle]

    parsed = [
        entry.lower().split('.')
        for entry in entries
        if entry and not entry.startswith('#')
    ]
    # Fewest labels first (stable sort).
    return sorted(parsed, key=len)
def find(labelses, removedDomainFile):
    ''' Find redundant items by a tree of top-level domain label to sub-level.

        `tree` is like { 'com': { 'foo': { 'bar': LEAF },
                                  'abc': LEAF },
                         'org': ... }

        labelses:          list of label lists, e.g. ['bar', 'foo', 'com'].
                           A parent domain is only recognised if it appears
                           before its subdomains, so the input should be
                           ordered with fewer labels first. The lists are
                           read but not modified.
        removedDomainFile: path of the file that receives one redundant
                           domain per line. It is opened in append mode and
                           only touched when at least one redundant domain
                           was found.

        A "Redundant found" line is printed for every redundant domain.
    '''
    tree = {}
    LEAF = 1
    redundant = []

    for labels in labelses:
        domain = '.'.join(labels)
        # Init root node as current node
        node = tree
        last = len(labels) - 1
        # Walk from the top-level label down without consuming `labels`.
        for depth, label in enumerate(reversed(labels)):
            if label in node:
                # If the child node is a LEAF, the current domain is either an
                # already known domain or a subdomain of a known domain.
                if node[label] == LEAF:
                    # Labels that have not been visited yet (left-hand part).
                    remaining = labels[:last - depth]
                    print(f"Redundant found: {domain} at {'.'.join(remaining)}")
                    redundant.append(domain)
                    break
            else:
                # Leaf node for the last label, branch node otherwise
                node[label] = LEAF if depth == last else {}
            # Iterate to child node
            node = node[label]

    # Append everything in one go instead of re-opening the file per domain.
    if redundant:
        with open(removedDomainFile, "a") as f:
            f.writelines(f"{name}\n" for name in redundant)
if __name__ == '__main__':
    # argv[1]: domain list to scan, argv[2]: file receiving redundant domains
    parsed_domains = load(sys.argv[1])
    find(parsed_domains, sys.argv[2])

View File

@ -0,0 +1,3 @@
ifanr.com
weibo.com
www.baidu.com

2
resouces/proxy.txt Normal file
View File

@ -0,0 +1,2 @@
supertop.co
hk.chinamobile.com

View File

@ -0,0 +1,33 @@
4paradigm.com
addthis.com
addthisedge.com
alimama.alicdn.com
alimama.com
analytics.google.com
app.chat.xiaomi.net
bdtj.tagtic.cn
cdn.onesignal.com
click.discord.com
click.redditmail.com
ctrip.com
d.ifengimg.com
icons.mydrivers.com
img.alibaba.com
jav321.com
knet.cn
mail.tsinghua.edu.cn
mtalk.google.com
mx.technolutions.net
newrelic.com
offer.alibaba.com
pingjs.qq.com
qlogo.cn
resolver.msg.xiaomi.net
s.youtube.com
sf3-ttcdn-tos.pstatp.com
t.co
tagtic.cn
telegra.ph
tongji.baidu.com
tv.sohu.com
ue.yeyoucdn.com

0
resouces/reject.txt Normal file
View File

29
resouces/removeFrom.py Normal file
View File

@ -0,0 +1,29 @@
import argparse
def remove_domains(file_to_remove, file_to_remove_from, output_file):
    """Write the entries of one list that are not present in another.

    Lines of both input files are compared after stripping surrounding
    whitespace. Duplicates in the source list are collapsed.

    Args:
        file_to_remove: Path of the file listing the domains to drop.
        file_to_remove_from: Path of the file holding the full domain list.
        output_file: Path of the file to (over)write with the result.

    The result keeps the order in which domains first appear in
    ``file_to_remove_from`` (set iteration order would vary between runs),
    skips blank lines, and terminates every line with a newline so the file
    can be safely concatenated with others.
    """
    with open(file_to_remove, "r") as f_remove:
        domains_to_remove = {line.strip() for line in f_remove}

    with open(file_to_remove_from, "r") as f_from:
        # dict.fromkeys de-duplicates while preserving first-seen order.
        candidates = dict.fromkeys(line.strip() for line in f_from)

    remaining_domains = [
        domain
        for domain in candidates
        if domain and domain not in domains_to_remove
    ]

    with open(output_file, "w") as output:
        output.writelines(f"{domain}\n" for domain in remaining_domains)
if __name__ == "__main__":
    # Command line: -remove <file> -from <file> -out <file> (all mandatory).
    cli = argparse.ArgumentParser(description="Remove domains from a file.")
    cli.add_argument(
        "-remove",
        required=True,
        help="File containing domains to be removed",
    )
    # "from" is a Python keyword, so the value is stored under "from_file".
    cli.add_argument(
        "-from",
        required=True,
        dest="from_file",
        help="File to remove domains from",
    )
    cli.add_argument("-out", required=True, help="Output file")

    options = cli.parse_args()
    remove_domains(options.remove, options.from_file, options.out)