-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkeywords.py
116 lines (106 loc) · 2.37 KB
/
keywords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import collections
import json
import re
# Category name -> lowercase keywords/phrases. A paper is assigned to a
# category when at least one of its keywords occurs as a whole word (or whole
# phrase) in the paper's lowercased title.
keywords = {
    # Graph, stream, document and other non-relational data
    "semi-structured": [
        "graph",
        "graphs",
        "rdf",
        "social network",
        "subgraph",
        "sparql",
        "temporal",
        "stream",
        "streams",
        "semistructured",
        "xml",
        "json",
        "xpath",
        "xquery",
        "spatial",
        "key-value",
        "document",
    ],
    # Classic database-systems vocabulary
    "core": [
        "information",
        "system",
        "systems",
        "transaction",
        "query",
        "querying",
        "queries",
        "query processing",
        "query optimization",
        "storage",
        "index",
        "indexing",
        "data model",
        "semantics",
        "query language",
        "algebra",
        "calculus",
        "deadlock",
        "relational",
        "search",
        "management",
        "data",
        "database",
        "databases",
    ],
    # Newer topics: learning, analytics, human-in-the-loop, ...
    "new": [
        "machine learning",
        "data science",
        "visualization",
        "human",
        "interactive",
        "user",
        "crowdsourcing",
        "p2p",
        "integration",
        "mining",
        "analytics",
        "linkage",
        "skyline",
        "top-k",
        "responsible",
    ],
    # Scalability and performance
    "performance": [
        "performance",
        "scalable",
        "distributed",
        "parallel",
        "hardware",
        "realtime",
        "concurrency",
        "scaling",
        "multicore",
        "benchmark",
        "efficient",
    ],
}
# Build one whole-word pattern per keyword up front so the regexes are not
# reassembled for every title. re.escape keeps each keyword literal, so a
# keyword containing a regex metacharacter cannot change the pattern's meaning.
_keyword_patterns = {
    cat: [re.compile(r"\b" + re.escape(kw) + r"\b") for kw in kws]
    for (cat, kws) in keywords.items()
}

# eid -> set of category names whose keywords occur in the paper's title
paper_cats = {}
# eid -> lowercased title (only for papers that have a non-empty title)
titles = {}

# scopus.json is read line by line, one JSON document per line. The context
# manager guarantees the file is closed even if a line fails to parse.
with open('scopus.json', encoding='utf-8') as scopus_file:
    for line in scopus_file:
        # Tolerate blank lines (e.g. a trailing newline at end of file)
        if not line.strip():
            continue
        jsonl = json.loads(line)

        # Try to get the ID of the first paper from this line; skip the line
        # if the expected structure is missing or the entry list is empty
        try:
            entry = jsonl["search-results"]["entry"][0]
            eid = entry["eid"]
        except (KeyError, IndexError, TypeError):
            continue

        # Register the paper before looking at the title, so papers without
        # a usable title are still counted (with zero categories) below
        paper_cats[eid] = set()

        # Try to get the title of the paper; a missing or null title is
        # treated like an empty one instead of raising AttributeError
        title = (entry.get("dc:title") or "").lower()
        if not title:
            continue
        titles[eid] = title

        # A category applies as soon as any one of its keywords matches
        for (cat, patterns) in _keyword_patterns.items():
            if any(pattern.search(title) for pattern in patterns):
                paper_cats[eid].add(cat)

# Count the number of papers assigned to 0, 1, 2, 3, ... categories
cat_counts = collections.Counter(len(cats) for cats in paper_cats.values())
print(cat_counts)