Egor Bulychev, source{d}.
Egor Bulychev
source{d}
def nearest_neighbors(self, origin, k=10,
skipped_stop=0.99, throw=True):
# origin can be either a text query or an id
if isinstance(origin, (tuple, list):
words, weights = origin
weights = numpy.array(weights, dtype=numpy.float32)
index = None
avg = self._get_centroid(words, weights, force=True)
else:
index = origin
words, weights = self._get_vocabulary(index)
avg = self._get_centroid_by_index(index)
if avg is None:
raise ValueError(
"Too little vocabulary for %s: %d" % (index, len(words)))
def nearest_neighbors(self, origin, k=10,
skipped_stop=0.99, throw=True):
# origin can be either a text query or an id
if isinstance(origin, (tuple, list):
words, weights = origin
weights = numpy.array(weights, dtype=numpy.float32)
index = None
avg = self._get_centroid(words, weights, force=True)
else:
index = origin
words, weights = self._get_vocabulary(index)
avg = self._get_centroid_by_index(index)
if avg is None:
raise ValueError(
"Too little vocabulary for %s: %d" % (index, len(words)))
⬤ keywords
⬤ identifiers
⬤ literals
⬤ strings
⬤ comments
⬤ reserved
_tcp_socket_connect -> [tcp, socket, connect]
AuthenticationError -> [authentication, error]
set_visible -> [set, visible]
class ???:
def connect(self, dbname, user, password, host, port):
# ...
def query(self, sql):
# ...
def close(self):
# ...
class Database:
def connect(self, dbname, user, password, host, port):
# ...
def query(self, sql):
# ...
def close(self):
# ...
set_name -> [set, name]
SetName -> [set, name]
BUFFERFLAG_CODECCONFIG -> [bufferflag, codecconfig]
metamodelength -> [metamodelength]
BUFFERFLAG_CODECCONFIG -> [buffer, flag, codec, config]
metamodelength -> [meta, mode, length]
“send” - “receive” + “pop” = “push”
“database” - “query” + “tune” = “settings”
\( \begin{split} V_1 \Leftrightarrow & \,\texttt{"foo"} \\ \\ V_2 \Leftrightarrow & \,\texttt{"bar"} \\ \\ V_3 \Leftrightarrow & \,\texttt{"integrate"} \end{split} \)
\( distance(V_1, V_2) < distance(V_1, V_3) \)
\( distance(V_i, V_j) = \arccos \frac{V_i \cdot V_j}{\left\lVert V_i \right\rVert \left\lVert V_j \right\rVert} \)
class Database:
def connect(self, user, password, host, port):
self._tcp_socket_connect(host, port)
try:
self._authenticate(user, password)
except AuthenticationError as e:
self.socket.close()
raise e from None
_tcp_socket_connect -> [tcp, socket, connect]
AuthenticationError -> [authentication, error]
authentication, authenticate -> authenticate
class Database:
def connect(self, user, password, host, port):
self._tcp_socket_connect(host, port)
try:
self._authenticate(user, password)
except AuthenticationError as e:
self.socket.close()
raise e from None
database, connect², user², password², host², port², tcp, socket², authenticate², error, close
class Database:
def connect(self, user, password, host, port):
self._tcp_socket_connect(host, port)
try:
self._authenticate(user, password)
except AuthenticationError as e:
self.socket.close()
raise e from None
connect², user², password², host², port², tcp, socket², authenticate², error, close
class Database:
def connect(self, user, password, host, port):
self._tcp_socket_connect(host, port)
try:
self._authenticate(user, password)
except AuthenticationError as e:
self.socket.close()
raise e from None
connect, user, password, host, port
class Database:
def connect(self, user, password, host, port):
self._tcp_socket_connect(host, port)
try:
self._authenticate(user, password)
except AuthenticationError as e:
self.socket.close()
raise e from None
tcp, socket, connect, host, port
class Database:
def connect(self, user, password, host, port):
self._tcp_socket_connect(host, port)
try:
self._authenticate(user, password)
except AuthenticationError as e:
self.socket.close()
raise e from None
authenticate², user, password, error, socket, close
class Database:
def connect(self, user, password, host, port):
self._tcp_socket_connect(host, port)
try:
self._authenticate(user, password)
except AuthenticationError as e:
self.socket.close()
raise e from None
authenticate, user, password
class Database:
def connect(self, user, password, host, port):
self._tcp_socket_connect(host, port)
try:
self._authenticate(user, password)
except AuthenticationError as e:
self.socket.close()
raise e from None
authenticate, error, socket, close
class Database:
def connect(self, user, password, host, port):
self._tcp_socket_connect(host, port)
try:
self._authenticate(user, password)
except AuthenticationError as e:
self.socket.close()
raise e from None
authenticate, error
class Database:
def connect(self, user, password, host, port):
self._tcp_socket_connect(host, port)
try:
self._authenticate(user, password)
except AuthenticationError as e:
self.socket.close()
raise e from None
socket, close
Stage | Time | Resources | Size on disk |
---|---|---|---|
Cloning 143k repos | 3 days | 20x2 cores, 256 GB RAM | 2.6 TB |
Dataset | 4 days | 20x2 cores, 256 GB RAM | 2 TB (31 GB in xz) |
fastprep | 2 days | 16x2 cores, 256 GB RAM | 20 GB |
Swivel | 14 hours | 2 Titan X'2016 + 2 1080Ti | 5.6 GB |
Let's interpret every repository as a weighted bag-of-words.
We calculate TF-IDF to weight the occurring identifiers.
tensorflow/tensorflow:
tfreturn 67.780249 oprequires 63.968142 doblas 63.714424 gputools 62.337396 tfassign 61.545926 opkernel 60.721556 sycl 57.064558 hlo 55.723587 libxsmm 54.820668 tfdisallow 53.666890
WMD evaluation is O(N³) and becomes slow at N≈100.
This allows us to avoid 95% of WMD evaluations on average.
class ClasName:
@classmethod
def function_name(cls) -> str:
varyable_name = "Hello, I'm ClassName object!"
return varyable_name
funktion ➙ function
GetValu ➙ GetValue
str_lenght ➙ str_length
class ClassName:
@classmethod
def function_name(cls) -> str:
variable_name = "Hello, I'm ClassName object!"
return variable_name
(identifier, token, candidate):
label=1 if the candidate is the correct suggestion
label=0 otherwise
From git history
Dev\Files | File 1 | File 2 | File 3 |
---|---|---|---|
Dev 1 | 0 | 0.1 | 0.8 |
Dev 2 | 0.4 | 0.1 | 0.5 |
From head revision
Files\Tokens | Token 1 | Token 2 | Token 3 |
---|---|---|---|
File 1 | 0 | 0.1 | 0.8 |
File 2 | 0.4 | 0.1 | 0.56 |
File 3 | 0.43 | 0.18 | 0.9 |
Result after multiplication
Dev\Token | Token 1 | Token 2 | Token 3 |
---|---|---|---|
Dev 1 | 0 | 0.1 | 0.8 |
Dev 2 | 0.4 | 0.1 | 0.5 |